File:  [LON-CAPA] / loncom / interface / entities.pm
Revision 1.1: download - view: text, annotated - select for diffs
Mon Feb 11 11:35:46 2008 UTC (16 years, 3 months ago) by foxr
Branches: MAIN
CVS tags: HEAD
Building up hash of entity -> latex translations to sanitize
lonprintout.pm's character_table sub and to
1. Make it easy to add new entities.
2. Add some entities that are not in the table.
3. I think this version will also run faster.

    1: # The LearningOnline Network
    2: # entity -> tex.
    3: #
    4: # $Id:
    5: #
    6: # Copyright Michigan State University Board of Trustees
    7: #
    8: # This file is part of the LearningOnline Network with CAPA (LON-CAPA).
    9: #
   10: # LON-CAPA is free software; you can redistribute it and/or modify
   11: # it under the terms of the GNU General Public License as published by
   12: # the Free Software Foundation; either version 2 of the License, or
   13: # (at your option) any later version.
   14: #
   15: # LON-CAPA is distributed in the hope that it will be useful,
   16: # but WITHOUT ANY WARRANTY; without even the implied warranty of
   17: # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   18: # GNU General Public License for more details.
   19: #
   20: # You should have received a copy of the GNU General Public License
   21: # along with LON-CAPA; if not, write to the Free Software
   22: # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   23: #
   24: # /home/httpd/html/adm/gpl.txt
   25: # http://www.lon-capa.org/
   26: #
   27: #
   28: package Apache::entities;
   29: use strict;
   30: #
   31: #   This file contains a table driven entity-->latex converter.
   32: #
   33: #  Assumptions:
   34: #   The number of entities in a resource is small compared with the
   35: #   number of possible entities that might be translated.
   36: #   Therefore the strategy is to match a general entity pattern
   37: #   &.+; over and over, pull out the match look it up in an entity -> tex hash
   38: #   and do the replacement.
   39: #
   40: #  In order to simplify the hash, the following reductions are done:
   41: #   &#\d+; has the &# and ; stripped and is converted to an int.
   42: #   &#x.+; has the &#x and ; stripped and is converted to an int as a hex
   43: #                             value.
   44: #   All others have the & and ; stripped.
   45: 
   46: 
   47: #  The hash:  Add new conversions here; leave off the leading & and the trailing ;
   48: #  All numeric entities need only appear as their decimal versions
   49: #  (e.g. 1234 is sufficient; there is no need for 0x4d2 as well).
   50: #
   51: #  This entity table is mercilessly cribbed from the  HTML pocket reference
   52: #  table starting at pg 82.  In most cases the LaTeX equivalent codes come from
   53: #  the original massive regular expression replacements originally by 
   54: #  A. Sakharuk in lonprintout.pm
   55: #
   56: #  Note numerical entities are essentially unicode character codes.
   57: #
   58: my %entities = {
   59: 
   60:     #  ---- ASCII code page: ----------------
   61: 
   62:     # Translation to empty strings:
   63: 
   64:     7        => "",
   65:     9        => "",
   66:     10       => "",
   67:     13       => "",
   68:     
   69:     # Translations to simple characters:
   70: 
   71:     32       => " ",
   72:     33       => "!",
   73:     34       => '"',
   74:     'quot'   => '"',
   75:     35       => '\\\#',
   76:     36       => '\\\$',
   77:     37       => '\\%',
   78:     38       => '\\&',
   79:     'amp'    => '\\&',
   80:     39       => '\'',		# Apostrophe
   81:     40       => '(',
   82:     41       => ')',
   83:     42       => '\*',
   84:     43       => '\+',
   85:     44       => ',',		#  comma
   86:     45       => '-',
   87:     46       => '\.',
   88:     47       => '\/',
   89:     48       => '0',
   90:     49       => '1',
   91:     50       => '2',
   92:     51       => '3',
   93:     52       => '4',
   94:     53       => '5',
   95:     54       => '6',
   96:     55       => '7',
   97:     56       => '8',
   98:     57       => '9',
   99:     58       => ':',
  100:     59       => ';',
  101:     60       => '\\ensuremath\{<\}',
  102:     'lt'     => '\\ensuremath\{<\}',
  103:     61       => '\\ensuremath\{=\}',
  104:     62       => '\\ensuremath\{>\}',
  105:     'gt'     => '\\ensuremath\{>\}',
  106:     63       => '\?',
  107:     64       => '@',
  108:     65       => 'A',
  109:     66       => 'B',
  110:     67       => 'C',
  111:     68       => 'D',
  112:     69       => 'E',
  113:     70       => 'F',
  114:     71       => 'G',
  115:     72       => 'H',
  116:     73       => 'I',
  117:     74       => 'J',
  118:     75       => 'K',
  119:     76       => 'L',
  120:     77       => 'M',
  121:     78       => 'N',
  122:     79       => 'O',
  123:     80       => 'P',
  124:     81       => 'Q',
  125:     82       => 'R',
  126:     83       => 'S',
  127:     84       => 'T',
  128:     85       => 'U',
  129:     86       => 'V',
  130:     87       => 'W',
  131:     88       => 'X',
  132:     89       => 'Y',
  133:     90       => 'Z',
  134:     91       => '[',
  135:     92       => '\\ensuremath\{\\setminus\}', # \setminus is \ with special spacing.
  136:     93       => ']',
  137:     94       => '\\ensuremath\{\\wedge\}',
  138:     95       => '\\underline\{\\makebox[2mm]\\{\\strut\}\}', # Underline 2mm of space for _
  139:     96       => '`',
  140:     97       => 'a',
  141:     98       => 'b',
  142:     99       => 'c',
  143:     100      => 'd',
  144:     101      => 'e',
  145:     102      => 'f',
  146:     103      => 'g',
  147:     104      => 'h', 
  148:     105      => 'i',
  149:     106      => 'j',
  150:     107      => 'k',
  151:     108      => 'l',
  152:     109      => 'm',
  153:     110      => 'n',
  154:     111      => 'o',
  155:     112      => 'p',
  156:     113      => 'q',
  157:     114      => 'r',
  158:     115      => 's',
  159:     116      => 't',
  160:     117      => 'u',
  161:     118      => 'v',
  162:     119      => 'w',
  163:     120      => 'x',
  164:     121      => 'y',
  165:     122      => 'z',
  166:     123      => '\\{',
  167:     124      => '\|',
  168:     125      => '\\}',
  169:     126      => '\~',
  170: 
  171:     #   Controls and Latin-1 supplement.  Note that some entities that have
  172:     #   visible effect are not printing unicode characters.  Specifically
  173:     #   &#130;-&#160;
  174: 
  175:     130     => ',',
  176:     131     => '\\textflorin ',
  177:     132     => ',,',		# Low double left quotes.
  178:     133     => '\\ensuremat\{\\ldots\}',
  179:     134     => '\\ensuremath\{\\dagger\}',
  180:     135     => '\\ensuremath\{\\ddagger\}',
  181:     136     => '\\ensuremath\{\\wedge\}',
  182:     137     => '\\textperthousand ',
  183:     138     => '\\v\{S\}',
  184:     139     => '\\ensuremath\{<\}',
  185:     140     => '\{\\OE\}',
  186:     
  187:     #  There's a gap here in my entity table
  188: 
  189:     145     => '\`',
  190:     146     => '\'',
  191:     147     => '\`\`',
  192:     148     => '\'\'',
  193:     149     => '\\ensuremath\{\\bullet\}',
  194:     150     => '--',
  195:     151     => '---',
  196:     152     => '\\ensuremath\{\\sim\}',
  197:     153     => '\\texttrademark',
  198:     154     => '\\v\{s\}',
  199:     155     => '\\ensuremath\{>\}',
  200:     156     => '\\oe ',
  201:     
  202:     # Another short gap:
  203: 
  204:     159     => '\\"Y',
  205:     160     => '~',
  206:     'nbsp'  => '~',
  207:     161     => '\\textexclamdown ',
  208:     'iexcl' => '\\textexclamdown ',
  209:     162     => '\\textcent ',
  210:     'cent'  => '\\textcent ',
  211:     163     => '\\pounds ',
  212:     'pound' => '\\pounds ',
  213:     164     => '\\textcurrency ',
  214:     'curren' => '\\textcurrency ',
  215:     165     => '\\textyen ',
  216:     'yen'   => '\\textyen ',
  217:     166     => '\\textbrokenbar ',
  218:     'brvbar' => '\\textbrokenbar ',
  219:     167     => '\\textsection ',
  220:     'sect'  => '\\textsection ',
  221:     168     => '\\texthighdieresis ',
  222:     'uml'   => '\\texthighdieresis ',
  223:     169     => '\\copyright ',
  224:     'copy'  => '\\copyright ',
  225:     170     => '\\textordfeminine ',
  226:     'ordf'  => '\\textordfeminine ',
  227:     171     => '\\ensuremath\{\ll\}', # approximation of left angle quote.
  228:     'laquo' => '\\ensuremath\{\ll\}', #   ""
  229:     172     => '\\ensuremath\{\\neg\}',
  230:     'not'   => '\\ensuremath\{\\neg\}',
  231:     173     => ' - ',
  232:     'shy'   => ' - ',
  233:     174     => '\\textregistered ',
  234:     'reg'   => '\\textregistered ',
  235:     175     => '\\ensuremath\{^\{-\}\}',
  236:     'macr'  => '\\ensuremath\{^\{-\}\}',
  237:     176     => '\\ensuremath\{^\{\\circ\}\}',
  238:     'deg'   => '\\ensuremath\{^\{\\circ\}\}',
  239:     177     => '\\ensuremath\{\\pm\}',
  240:     'plusmn' => '\\ensuremath\{\\pm\}',
  241:     178     => '\\ensuremath\{^2\}',
  242:     'sup2'  => '\\ensuremath\{^2\}',
  243:     179     => '\\ensuremath\{^3\}',
  244:     'sup3'  => '\\ensuremath\{^3\}',
  245:     180     => '\\textacute ',
  246:     'acute' => '\\textacute ',
  247:     181     => '\\ensuremath\{\\mu\}',
  248:     'micro' => '\\ensuremath\{\\mu\}',
  249:     182     => '\\P ',
  250:     para    => '\\P ',
  251:     183     => '\\ensuremath\{\\cdot\}',
  252:     'middot' => '\\ensuremath\{\\cdot\}',
  253:     184     => '\\c\{\\strut\}',
  254:     'cedil' => '\\c\{\\strut\}',
  255:     185     => '\\ensuremath\{^1\}',
  256:     sup1    => '\\ensuremath\{^1\}',
  257:     186     => '\\textordmasculine ',
  258:     'ordm'  => '\\textordmasculine ',
  259:     187     => '\\ensuremath\{\\gg\}',
  260:     'raquo' => '\\ensuremath\{\\gg\}',
  261:     188     => '\\textonequarter ',
  262:     'frac14' => '\\textonequarter ',
  263:     189     => '\\textonehalf' ,
  264:     'frac12' => '\\textonehalf' ,
  265:     190     => '\\textthreequarters ',
  266:     'frac34' => '\\textthreequarters ',
  267:     191     =>  '\\textquestiondown ',
  268:     'iquest' => '\\textquestiondown ',
  269:     192     => '\\\`\{A\}',
  270:     'Agrave' => '\\\`\{A\}',
  271:     193     => '\\\'\{A\}',
  272:     'Aacute' => '\\\'\{A\}',
  273:     194     => '\\^\{A\}',
  274:     'Acirc' => '\\^\{A\}',
  275:     195     => '\\~{A}',
  276:     'Atilde'=> '\\~{A}',
  277:     196     => '\\\"{A}',
  278:     'Auml'  => '\\\"{A}',
  279:     197     => '{\\AA}',
  280:     'Aring' => '{\\AA}',
  281:     198     => '{\\AE}',
  282:     'AElig' => '{\\AE}',
  283:     199     => '\\c{c}',
  284:     'Ccedil'=> '\\c{c}',
  285:     '200'   => '\\\`{E}',
  286:     'Egrave'=> '\\\`{E}',
  287:     201     => '\\\'{E}',
  288:     'Eacute'=> '\\\'{E}',
  289:     202     => '\\\^{E}',
  290:     'Ecirc' => '\\\^{E}',
  291:     203     => '\\\"{E}',
  292:     'Euml'  => '\\\"{E}',
  293:     204     => '\\\`{I}',
  294:     'Igrave'=> '\\\`{I}',
  295:     205     => '\\\'{I}',
  296:     'Iacute'=> '\\\'{I}',
  297:     206     => '\\\^{I}',
  298:     'Icirc' => '\\\^{I}',
  299:     207     => '\\\"{I}',
  300:     'Iuml'  => '\\\"{I}',
  301:     208     => '\\OE',
  302:     'ETH'   => '\\OE',
  303:     209     => '\\~{N}',
  304:     'Ntilde'=> '\\~{N}',
  305:     210     => '\\\`{O}',
  306:     'Ograve'=> '\\\`{O}',
  307:     211     => '\\\'{O}',
  308:     'Oacute'=> '\\\'{O}',
  309:     212     => '\\\^{O}',
  310:     'Ocirc' => '\\\^{O}',
  311:     213     => '\\~{O}',
  312:     'Otilde'=> '\\~{O}',
  313:     214     => '\\\"{O}',
  314:     'Ouml'  => '\\\"{O}',
  315:     215     => '\\ensuremath\{\\times\}',
  316:     'times' => '\\ensuremath\{\\times\}',
  317:     216     => '\\O',
  318:     'Oslash'=> '\\O',
  319:     217     => '\\\`{U}',
  320:     'Ugrave'=> '\\\`{U}',
  321:     218     => '\\\'{U}',
  322:     'Uacute'=> '\\\'{U}',
  323:     219     => '\\\^{U}',
  324:     'Ucirc' => '\\\^{U}',
  325:     220     => '\\\"{U}',
  326:     'Uuml'  => '\\\"{U}',
  327:     221     => '\\\'{Y}',
  328:     'Yacute'=> '\\\'{Y}',
  329:     222     => '\\TH',
  330:     'THORN' => '\\TH',
  331:     223     => '{\\sz}',
  332:     'szlig' => '{\\sz}',
  333:     224     => '\\\`{a}',
  334:     'agrave'=> '\\\`{a}',
  335:     225     => '\\\'{a}',
  336:     'aacute'=> '\\\'{a}',
  337:     226     => '\\\^{a}',
  338:     'acirc' => '\\\^{a}',
  339:     227     => '\\\~{a}',
  340:     'atilde'=> '\\\~{a}',
  341:     228     => '\\\"{a}',
  342:     'auml'  => '\\\"{a}',
  343:     229     => '\\aa',
  344:     'aring' => '\\aa',
  345:     230     => '\\ae',
  346:     'aelig' => '\\ae',
  347:     231     => '\\c{c}',
  348:     'ccedil'=> '\\c{c}',
  349:     232     => '\\\`{e}',
  350:     'egrave'=> '\\\`{e}',
  351:     233     => '\\\'{e}',
  352:     'eacute'=> '\\\'{e}',
  353:     234     => '\\\^{e}',
  354:     'ecirc' => '\\\^{e}',
  355:     235     => '\\\"{e}',
  356:     'euml'  => '\\\"{e}',
  357:     236     => '\\\`{i}',
  358:     'igrave'=> '\\\`{i}',
  359:     237     => '\\\'{i}',
  360:     'iacute'=> '\\\'{i}',
  361:     238     => '\\\^{i}',
  362:     'icirc' => '\\\^{i}',
  363:     239     => '\\\"{i}',
  364:     'iuml'  => '\\\"{i}',
  365:     240     => '\\dh',
  366:     'eth'   => '\\dh',
  367:     241     => '\\\~{n}',
  368:     'ntilde'=> '\\\~{n}',
  369:     242     => '\\\`{o}',
  370:     'ograve'=> '\\\`{o}',
  371:     243     => '\\\'{o}',
  372:     'oacute'=> '\\\'{o}',
  373:     244     => '\\\^{o}',
  374:     'ocirc' => '\\\^{o}',
  375:     245     => '\\\~{o}',
  376:     'otilde'=> '\\\~{o}',
  377:     246     => '\\\"{o}',
  378:     'ouml'  => '\\\"{o}',
  379:     247     => '\\ensuremath\{\\div\}',
  380:     'divide'=> '\\ensuremath\{\\div\}',
  381:     248     => '{\\o}',
  382:     'oslash'=> '{\\o}',
  383:     249     => '\\\`{u}',
  384:     'ugrave'=> '\\\`{u}',
  385:     250     => '\\\'{u}',
  386:     'uacute'=> '\\\'{u}',
  387:     251     => '\\\^{u}',
  388:     'ucirc' => '\\\^{u}',
  389:     252     => '\\\"{u}',
  390:     'uuml'  => '\\\"{u}',
  391:     253     => '\\\'{y}',
  392:     'yacute'=> '\\\'{y}',
  393:     254     => '\\th',
  394:     'thorn' => '\\th',
  395:     255     => '\\\"{y}',
  396:     'yuml'  => '\\\"{y}',
  397: 
  398:     # hbar entity number comes from the unicode charater:
  399:     # see e.g. http://www.unicode.org/charts/PDF/U0100.pdf
  400:     # ISO also documents a 'planck' entity.
  401: 
  402:     295     => '\\ensuremath\{\hbar\}',
  403:     'plank' => '\\ensuremath\{\hbar\}',
  404: 
  405:     # Latin extended-A HTML 4.01 entities:
  406: 
  407:     338      => '\\OE',
  408:     'OElig'  => '\\OE',
  409:     339      => '\\oe',
  410:     'oelig'  => '\\oe',
  411:     352      => '\\v{S}',
  412:     'Scaron' => '\\v{S}',
  413:     353      => '\\v{s}',
  414:     'scaron' => '\\v{s}',
  415:     376      => '\\\"{Y}',
  416:     'Yuml'   => '\\\"{Y}', 
  417: 
  418: 
  419:     # Latin extended B HTML 4.01 entities
  420: 
  421:     402      => '\\ensuremath{f}',
  422:     'fnof'   => '\\ensuremath{f}',
  423: 
  424:     # Spacing modifier letters:
  425:     
  426:     710      => '\^{}',
  427:     'circ'   => '\^{}',
  428:     732      => '\~{}',
  429:     'tilde'  => '\~{}',
  430: 
  431:     # Greek uppercase:
  432: 
  433:     913      => '\\ensuremath\{\\mathrm\{A\}\}',
  434:     'Alpha'  => '\\ensuremath\{\\mathrm\{A\}\}',
  435:     914      => '\\ensuremath\{\\mathrm\{B\}\}',
  436:     'Beta'   => '\\ensuremath\{\\mathrm\{B\}\}',
  437:     915      => '\\ensuremath\{\\Gamma\}',
  438:     'Gamma'  => '\\ensuremath\{\\Gamma\}',
  439:     916      => '\\ensuremath\{\\Delta\}',
  440:     'Delta'  => '\\ensuremath\{\\Delta\}',
  441:     917      => '\\ensuremath\{\\mathrm\{E\}\}',
  442:     'Epsilon'=> '\\ensuremath\{\\mathrm\{E\}\}',
  443:     918      => '\\ensuremath\{\\mathrm\{Z\}\}',
  444:     'Zeta'   => '\\ensuremath\{\\mathrm\{Z\}\}',
  445:     919      => '\\ensuremath\{\\mathrm\{H\}\}',
  446:     'Eta'    => '\\ensuremath\{\\mathrm\{H\}\}',
  447:     920      => '\\ensuremath\{\\Theta\}',
  448:     'Theta'  => '\\ensuremath\{\\Theta\}',
  449:     921      => '\\ensuremath\{\\mathrm\{I\}\}',
  450:     'Iota'   => '\\ensuremath\{\\mathrm\{I\}\}',
  451:     922      => '\\ensuremath\{\\mathrm\{K\}\}',
  452:     'Kappa'  => '\\ensuremath\{\\mathrm\{K\}\}',
  453:     923      => '\\ensuremath\{\\Lambda\}',
  454:     'Lambda' => '\\ensuremath\{\\Lambda\}',
  455:     924      => '\\ensuremath\{\\mathrm\{M\}\}',
  456:     'Mu'     => '\\ensuremath\{\\mathrm\{M\}\}',
  457:     925      => '\\ensuremath\{\\mathrm\{N\}\}',
  458:     'Nu'     => '\\ensuremath\{\\mathrm\{N\}\}',
  459:     926      => '\\ensuremath\{\\mathrm\{\\Xi\}',
  460:     'Xi'     => '\\ensuremath\{\\mathrm\{\\Xi\}',
  461:     927      => '\\ensuremath\{\\mathrm\{O\}\}',
  462:     'Omicron'=> '\\ensuremath\{\\mathrm\{O\}\}',
  463:     928      => '\\ensuremath\{\\Pi\}',
  464:     'Pi'     => '\\ensuremath\{\\Pi\}',
  465:     929      => '\\ensuremath\{\\mathrm\{P\}\}',
  466:     'Rho'    => '\\ensuremath\{\\mathrm\{P\}\}',
  467:    
  468:     # Skips 930
  469: 
  470:     931      => '\\ensuremath\{\Sigma\}',
  471:     'Sigma'  => '\\ensuremath\{\Sigma\}',
  472:     932      => '\\ensuremath\{\\mathrm\{T\}\}',
  473:     'Tau'    => '\\ensuremath\{\\mathrm\{T\}\}',
  474:     933      => '\\ensuremath\{\\Upsilon\}',
  475:     'Upsilon'=> '\\ensuremath\{\\Upsilon\}',
  476:     934      => '\\ensuremath\{\\Phi\}',
  477:     'Phi'    => '\\ensuremath\{\\Phi\}',
  478:     935      => '\\ensuremath\{\\mathrm\{X\}\}',
  479:     'Chi'    => '\\ensuremath\{\\mathrm\{X\}\}',
  480:     936      => '\\ensuremath\{\\Psi\}',
  481:     'Psi'    => '\\ensuermath\{\\Psi\}',
  482:     937      => '\\ensuremath\{\\Omega\}',
  483:     'Omega'  => '\\ensuremath\{\\Omega\}',
  484: 
  485: 
  486:     # Greek lowercase:
  487: 
  488:     945      => '\\ensuremath\{\\alpha\}',
  489:     'alpha'  => '\\ensuremath\{\\alpha\}',
  490:     946      => '\\ensuremath\{\\beta\}',
  491:     'beta'   => '\\ensuremath\{\\beta\}',
  492:     947      => '\\ensuremath\{\\gamma\}',
  493:     'gamma'  => '\\ensuremath\{\\gamma\}',
  494:     948      => '\\ensuremath\{\\delta\}',
  495:     'delta'  => '\\ensuremath\{\\delta\}',
  496:     949      => '\\ensuremath\{\\epsilon\}',
  497:     'epsilon'=> '\\ensuremath\{\\epsilon\}',
  498:     950      => '\\ensuremath\{\\zeta\}',
  499:     'zeta'   => '\\ensuremath\{\\zeta\}',
  500:     951      => '\\ensuremath\{\\eta\}',
  501:     'eta'    => '\\ensuremath\{\\eta\}',
  502: 
  503:     
  504: };

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>