File:  [LON-CAPA] / loncom / homework / math_parser / Tokenizer.pm
Revision 1.3: download - view: text, annotated - select for diffs
Mon Mar 13 22:31:22 2023 UTC (14 months, 3 weeks ago) by raeburn
Branches: MAIN
CVS tags: version_2_12_X, version_2_11_4_msu, HEAD
- Add $Id$ line in comments for display of version.

    1: # The LearningOnline Network with CAPA - LON-CAPA
    2: # String tokenizer
    3: #
    4: # $Id: Tokenizer.pm,v 1.3 2023/03/13 22:31:22 raeburn Exp $
    5: #
    6: # Copyright (C) 2014 Michigan State University Board of Trustees
    7: #
    8: # This program is free software: you can redistribute it and/or modify
    9: # it under the terms of the GNU General Public License as published by
   10: # the Free Software Foundation, either version 3 of the License, or
   11: # (at your option) any later version.
   12: #
   13: # This program is distributed in the hope that it will be useful,
   14: # but WITHOUT ANY WARRANTY; without even the implied warranty of
   15: # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
   16: # GNU General Public License for more details.
   17: #
   18: # You should have received a copy of the GNU General Public License
   19: # along with this program. If not, see <http://www.gnu.org/licenses/>.
   20: #
   21: 
   22: ##
   23: # String tokenizer. Recognizes only names, numbers, and parser operators.
   24: ##
   25: package Apache::math_parser::Tokenizer;
   26: 
   27: use strict;
   28: use warnings;
   29: use utf8;
   30: 
   31: use aliased 'Apache::math_parser::Definitions';
   32: use aliased 'Apache::math_parser::ParseException';
   33: use aliased 'Apache::math_parser::Token';
   34: 
   35: ##
   36: # @constructor
   37: # @param {Definitions} defs - Operator definitions
   38: # @param {string} text - The text to tokenize
   39: ##
   40: sub new {
   41:     my $class = shift;
   42:     my $self = {
   43:         _defs => shift,
   44:         _text => shift,
   45:     };
   46:     bless $self, $class;
   47:     return $self;
   48: }
   49: 
   50: # Attribute helpers
   51: 
   52: ##
   53: # Operator definitions
   54: # @returns {Definitions}
   55: ##
   56: sub defs {
   57:     my $self = shift;
   58:     return $self->{_defs};
   59: }
   60: 
   61: ##
   62: # The text to tokenize
   63: # @returns {string}
   64: ##
   65: sub text {
   66:     my $self = shift;
   67:     return $self->{_text};
   68: }
   69: 
   70: 
   71: ##
   72: # Tokenizes the text.
   73: # Can throw a ParseException.
   74: # @returns {Token[]}
   75: ##
   76: sub tokenize {
   77:     my( $self ) = @_;
   78:     my( $text, $c, $i, $from, @tokens, $value );
   79:     my @operators = @{$self->defs->operators};
   80:     my $dec1 = Definitions->DECIMAL_SIGN_1;
   81:     my $dec2 = Definitions->DECIMAL_SIGN_2;
   82:     
   83:     $text = $self->text;
   84:     if (!defined $text) {
   85:         die "Math Tokenizer: undefined text";
   86:     }
   87:     if (!utf8::is_utf8($text)) {
   88:         utf8::decode($text);
   89:     }
   90:     $i = 0;
   91:     $c = $i < length($text) ? substr($text, $i, 1) : '';
   92:     @tokens = ();
   93:     
   94: main:
   95:     while ($c ne '') {
   96:         $from = $i;
   97:         
   98:         # ignore whitespace
   99:         if ($c le ' ') {
  100:             $i++;
  101:             $c = $i < length($text) ? substr($text, $i, 1) : '';
  102:             next;
  103:         }
  104:         
  105:         # check for numbers before operators
  106:         # (numbers starting with . will not be confused with the . operator)
  107:         if (($c ge '0' && $c le '9') ||
  108:                 (($c eq $dec1 || $c eq $dec2) &&
  109:                 (substr($text, $i+1, 1) ge '0' && substr($text, $i+1, 1) le '9'))) {
  110:             $value = '';
  111:             
  112:             if ($c ne $dec1 && $c ne $dec2) {
  113:                 $i++;
  114:                 $value .= $c;
  115:                 # Look for more digits.
  116:                 for (;;) {
  117:                     $c = $i < length($text) ? substr($text, $i, 1) : '';
  118:                     if ($c lt '0' || $c gt '9') {
  119:                         last;
  120:                     }
  121:                     $i++;
  122:                     $value .= $c;
  123:                 }
  124:             }
  125:             
  126:             # Look for a decimal fraction part.
  127:             if ($c eq $dec1 || $c eq $dec2) {
  128:                 $i++;
  129:                 $value .= $c;
  130:                 for (;;) {
  131:                     $c = $i < length($text) ? substr($text, $i, 1) : '';
  132:                     if ($c lt '0' || $c gt '9') {
  133:                         last;
  134:                     }
  135:                     $i++;
  136:                     $value .= $c;
  137:                 }
  138:             }
  139:             
  140:             # Look for an exponent part.
  141:             if ($c eq 'e' || $c eq 'E') {
  142:                 $i++;
  143:                 $value .= $c;
  144:                 $c = $i < length($text) ? substr($text, $i, 1) : '';
  145:                 if ($c eq '-' || $c eq '+') {
  146:                     $i++;
  147:                     $value .= $c;
  148:                     $c = $i < length($text) ? substr($text, $i, 1) : '';
  149:                 }
  150:                 if ($c lt '0' || $c gt '9') {
  151:                     # syntax error in number exponent
  152:                     die ParseException->new("Syntax error in number exponent.", $from, $i);
  153:                 }
  154:                 do {
  155:                     $i++;
  156:                     $value .= $c;
  157:                     $c = $i < length($text) ? substr($text, $i, 1) : '';
  158:                 } while ($c ge '0' && $c le '9');
  159:             }
  160:             
  161:             # Convert the string value to a number. If it is finite, then it is a good token.
  162:             my $n = eval "\$value =~ tr/".$dec1.$dec2."/../";
  163:             if (!($n == 9**9**9 || $n == -9**9**9 || ! defined( $n <=> 9**9**9 ))) {
  164:                 push(@tokens, Token->new(Token->NUMBER, $from, $i - 1, $value));
  165:                 next;
  166:             } else {
  167:                 # syntax error in number
  168:                 die ParseException->new("Syntax error in number.", $from, $i);
  169:             }
  170:         }
  171:         
  172:         # check for operators before names (they could be confused with
  173:         # variables if they don't use special characters)
  174:         for (my $iop = 0; $iop < scalar(@operators); $iop++) {
  175:             my $op = $operators[$iop];
  176:             my $opid = $op->id;
  177:             if (substr($text, $i, length($opid)) eq $opid) {
  178:                 $i += length($op->id);
  179:                 $c = $i < length($text) ? substr($text, $i, 1) : '';
  180:                 push(@tokens, Token->new(Token->OPERATOR, $from, $i - 1, $op->id, $op));
  181:                 next main;
  182:             }
  183:         }
  184:         
  185:         # names
  186:         if (($c ge 'a' && $c le 'z') || ($c ge 'A' && $c le 'Z') ||
  187:                 ($c ge 'α' && $c le 'ω') || ($c ge 'Α' && $c le 'Ω') || $c eq 'µ' || $c eq '°') {
  188:             $value = $c;
  189:             $i++;
  190:             for (;;) {
  191:                 $c = $i < length($text) ? substr($text, $i, 1) : '';
  192:                 if (($c ge 'a' && $c le 'z') || ($c ge 'A' && $c le 'Z') ||
  193:                         ($c ge 'α' && $c le 'ω') || ($c ge 'Α' && $c le 'Ω') || $c eq 'µ' ||
  194:                         ($c ge '0' && $c le '9') || $c eq '_') {
  195:                     $value .= $c;
  196:                     $i++;
  197:                 } else {
  198:                     last;
  199:                 }
  200:             }
  201:             # "i" is turned into a NUMBER token
  202:             if ($value eq "i") {
  203:                 push(@tokens, Token->new(Token->NUMBER, $from, $i - 1, $value));
  204:                 next;
  205:             }
  206:             push(@tokens, Token->new(Token->NAME, $from, $i - 1, $value));
  207:             next;
  208:         }
  209:         
  210:         # unrecognized operator
  211:         die ParseException->new("Unrecognized operator.", $from, $i);
  212:     }
  213:     return @tokens;
  214: }
  215: 
  216: 1;
  217: __END__

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>