loncom/localize/transliterate.pm - annotate

Return to transliterate.pm CVS log
Up to [LON-CAPA] / loncom / localize
Annotation of loncom/localize/transliterate.pm, revision 1.2

1.1       raeburn     1: # The LearningOnline Network with CAPA
                      2: # Transliteration to ascii
                      3: #
1.2     ! raeburn     4: # $Id: transliterate.pm,v 1.1 2019/02/26 14:42:22 raeburn Exp $
1.1       raeburn     5: #
                      6: # Copyright Michigan State University Board of Trustees
                      7: #
                      8: # This file is part of the LearningOnline Network with CAPA (LON-CAPA).
                      9: #
                     10: # LON-CAPA is free software; you can redistribute it and/or modify
                     11: # it under the terms of the GNU General Public License as published by
                     12: # the Free Software Foundation; either version 2 of the License, or
                     13: # (at your option) any later version.
                     14: #
                     15: # LON-CAPA is distributed in the hope that it will be useful,
                     16: # but WITHOUT ANY WARRANTY; without even the implied warranty of
                     17: # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
                     18: # GNU General Public License for more details.
                     19: #
                     20: # You should have received a copy of the GNU General Public License
                     21: # along with LON-CAPA; if not, write to the Free Software
                     22: # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
                     23: #
                     24: # /home/httpd/html/adm/gpl.txt
                     25: #
                     26: # http://www.lon-capa.org/
                     27: #
                     28: ######################################################################
                     29: ######################################################################
                     30: 
                     31: =pod
                     32: 
                     33: =head1 NAME
                     34: 
                     35: LONCAPA::transliterate - transliterate non-ascii characters
                     36: in filenames.
                     37: 
                     38: =head1 SYNOPSIS
                     39: 
                     40: When fed a filename it will replace instances of non-ascii
                     41: characters with transliterations.
                     42: 
                     43: =head1 OVERVIEW
                     44: 
                     45: Used to replace non-ascii character(s) with a transliteration
                     46: of the character(s) to ascii character(s).
                     47: 
                     48: If there are preferred replacements for a particular language
                     49: then those should be included in a separate subroutine which
                     50: is called before the transliteration of last resort (which is
                     51: done with Text::Unidecode).
                     52: 
                     53: =head1 SUBROUTINES
                     54: 
                     55: =cut
                     56: 
                     57: package LONCAPA::transliterate;
                     58: 
                     59: use strict;
                     60: use utf8;
                     61: use Text::Unidecode qw(unidecode);
                     62: use Encode qw(decode_utf8 encode_utf8);
1.2     ! raeburn    63: use Unicode::Normalize qw(normalize);
1.1       raeburn    64: 
                     65: =pod
                     66: 
                     67: =over
                     68: 
                     69: =item * fname_to_ascii()
                     70: 
                     71: Inputs: $fname (required), $language (optional)
                     72: 
                     73: Output: $fname
                     74: 
                     75: Replaces non-ascii characters with a transliteration
                     76: of the character to an ascii character (using Text::Unidecode) 
                     77: 
                     78: If the language code is de, transliteration via 
                     79: german_to_ascii() is used first to handle umlauts and eszett,
                     80: before using Text::Unidecode.
                     81: 
                     82: If other routines are added to support preferred transliteration
                     83: of non-ascii characters for specific languages, they should be
                     84: added as new subroutines to this file, and then called if the
                     85: language code has an appropriate value.
                     86: 
                     87: =back
                     88: 
                     89: =cut
                     90: 
                     # fname_to_ascii($fname, $language)
                     #
                     # Inputs:  $fname    - filename to transliterate (required).
                     #                      NOTE(review): it is passed to decode_utf8(),
                     #                      so it is presumably UTF-8 encoded octets;
                     #                      confirm against the callers.
                     #          $language - language code (optional); only 'de' selects
                     #                      a language-specific step.
                     #
                     # Output:  $fname with non-ascii characters transliterated.  A name
                     #          that is undefined or contains no character outside
                     #          \x00-\x7f is returned unchanged.
                     sub fname_to_ascii {
                         my ($fname,$language) = @_;

                         # Guard clauses: an undefined name would otherwise be used in a
                         # pattern match (a warning when warnings are enabled), and a pure
                         # ascii name needs no work at all.
                         return $fname unless (defined($fname));
                         return $fname unless ($fname =~ /[^\x{00}-\x{7f}]/);

                         $fname = decode_utf8($fname);

                         # Normalization form C decomposes canonically before it
                         # recomposes, so a separate form D pass beforehand is not
                         # needed.  Composing matters because german_to_ascii() matches
                         # single precomposed characters.
                         $fname = normalize('C',$fname);

                         # $language is optional: test definedness before comparing.
                         if (defined($language) && ($language eq 'de')) {
                             $fname = german_to_ascii($fname);
                         }

                         # Transliteration of last resort for whatever is still non-ascii.
                         $fname = unidecode($fname);
                         return encode_utf8($fname);
                     }
                    105: 
                    106: =pod 
                    107: 
                    108: =over
                    109: 
                    110: =item * german_to_ascii()
                    111: 
                    112: Input: $fname (required)
                    113: 
                    114: Output: $fname
                    115: 
                    116: Replaces letters with umlauts with the equivalent letter
                    117: without an umlaut plus letter e. Case is preserved.
                    118: 
                    119: Replaces eszett with double s.
                    120: 
                    121: =back
                    122: 
                    123: =cut
                    124: 
                    # german_to_ascii($fname)
                    #
                    # Input:  $fname (required)
                    #
                    # Output: $fname in which each umlauted vowel has been replaced by
                    #         the plain vowel followed by the letter e (upper case stays
                    #         upper case, lower case stays lower case), and each eszett
                    #         has been replaced by a double s.  All other characters are
                    #         left as they are.
                    sub german_to_ascii {
                        my ($fname) = @_;

                        # Lookup table: German special character => ascii replacement.
                        my %ascii_for = (
                            'ä' => 'ae', 'Ä' => 'AE',
                            'ö' => 'oe', 'Ö' => 'OE',
                            'ü' => 'ue', 'Ü' => 'UE',
                            'ß' => 'ss',
                        );

                        # Swap every listed character for its table entry.
                        $fname =~ s/([ÄäÖöÜüß])/$ascii_for{$1}/ge;

                        return $fname;
                    }
                    139: 
                    140: 1;
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>