File:  [LON-CAPA] / loncom / localize / transliterate.pm
Revision 1.2: download - view: text, annotated - select for diffs
Sat Mar 2 23:08:51 2019 UTC (5 years, 2 months ago) by raeburn
Branches: MAIN
CVS tags: version_2_12_X, version_2_11_X, version_2_11_4_uiuc, version_2_11_4_msu, version_2_11_4, version_2_11_3_uiuc, version_2_11_3_msu, version_2_11_3, HEAD
- Bug 6792
  Normalize unicode representations before transliteration.

    1: # The LearningOnline Network with CAPA
    2: # Transliteration to ascii
    3: #
    4: # $Id: transliterate.pm,v 1.2 2019/03/02 23:08:51 raeburn Exp $
    5: #
    6: # Copyright Michigan State University Board of Trustees
    7: #
    8: # This file is part of the LearningOnline Network with CAPA (LON-CAPA).
    9: #
   10: # LON-CAPA is free software; you can redistribute it and/or modify
   11: # it under the terms of the GNU General Public License as published by
   12: # the Free Software Foundation; either version 2 of the License, or
   13: # (at your option) any later version.
   14: #
   15: # LON-CAPA is distributed in the hope that it will be useful,
   16: # but WITHOUT ANY WARRANTY; without even the implied warranty of
   17: # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   18: # GNU General Public License for more details.
   19: #
   20: # You should have received a copy of the GNU General Public License
   21: # along with LON-CAPA; if not, write to the Free Software
   22: # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   23: #
   24: # /home/httpd/html/adm/gpl.txt
   25: #
   26: # http://www.lon-capa.org/
   27: #
   28: ######################################################################
   29: ######################################################################
   30: 
   31: =pod
   32: 
   33: =head1 NAME
   34: 
   35: LONCAPA::transliterate - transliterate non-ascii characters
   36: in filenames.
   37: 
   38: =head1 SYNOPSIS
   39: 
   40: When fed a filename it will replace instances of non-ascii
   41: characters with transliterations.
   42: 
   43: =head1 OVERVIEW
   44: 
   45: Used to replace non-ascii character(s) with a transliteration
   46: of the character(s) to ascii character(s).
   47: 
   48: If there are preferred replacements for a particular language
   49: then those should be included in a separate subroutine which
   50: is called before the transliteration of last resort (which is
   51: done with Text::Unidecode).
   52: 
   53: =head1 SUBROUTINES
   54: 
   55: =cut
   56: 
   57: package LONCAPA::transliterate;
   58: 
   59: use strict;
   60: use utf8;
   61: use Text::Unidecode qw(unidecode);
   62: use Encode qw(decode_utf8 encode_utf8);
   63: use Unicode::Normalize qw(normalize);
   64: 
   65: =pod
   66: 
   67: =over
   68: 
   69: =item * fname_to_ascii()
   70: 
   71: Inputs: $fname (required), $language (optional)
   72: 
   73: Output: $fname
   74: 
   75: Replaces non-ascii characters with a transliteration
   76: of the character to an ascii character (using Text::Unidecode) 
   77: 
   78: If the language code is de, transliteration via 
   79: german_to_ascii() is used first to handle umlauts and eszett,
   80: before using Text::Unidecode.
   81: 
   82: If other routines are added to support preferred transliteration
   83: of non-ascii characters for specific languages, they should be
   84: added as new subroutines to this file, and then called if the
   85: language code has an appropriate value.
   86: 
   87: =back
   88: 
   89: =cut
   90: 
   91: sub fname_to_ascii {
   92:     my ($fname,$language) = @_;
   93:     if ($fname =~ /([^\x{00}-\x{7f}])/) {
   94:         $fname = decode_utf8($fname);
   95:         $fname = normalize('D',$fname);
   96:         $fname = normalize('C',$fname);
   97:         if ($language eq 'de') {
   98:             $fname = &german_to_ascii($fname);
   99:         }
  100:         $fname = unidecode($fname);
  101:         $fname = encode_utf8($fname);
  102:     }
  103:     return $fname;
  104: }
  105: 
  106: =pod 
  107: 
  108: =over
  109: 
  110: =item * german_to_ascii()
  111: 
  112: Input: $fname (required)
  113: 
  114: Output: $fname
  115: 
  116: Replaces letters with umlauts with the equivalent letter
  117: without an umlaut plus letter e. Case is preserved.
  118: 
  119: Replaces eszett with double s.
  120: 
  121: =back
  122: 
  123: =cut
  124: 
  125: sub german_to_ascii {
  126:     my ($fname) = @_;
  127:     my %characters = (
  128:                        'Ä' => 'AE',
  129:                        'Ö' => 'OE',
  130:                        'Ü' => 'UE',
  131:                        'ä' => 'ae',
  132:                        'ö' => 'oe',
  133:                        'ü' => 'ue',
  134:                        'ß' => 'ss',
  135:                       );
  136:     $fname =~ s/([ÄäÖöÜüß])/$characters{$1}/g;
  137:     return $fname;
  138: }
  139: 
  140: 1;

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>