File:  [LON-CAPA] / loncom / localize / transliterate.pm
Revision 1.1: download - view: text, annotated - select for diffs
Tue Feb 26 14:42:22 2019 UTC (5 years, 3 months ago) by raeburn
Branches: MAIN
CVS tags: HEAD
- Bug 6792
  - Replace a non-ascii character in the filename of uploaded file with an
    appropriate ascii character (if available).
  - If lonnet::clean_filename() reduces filename to .extension, prepend
    timestamp_milliseconds.

    1: # The LearningOnline Network with CAPA
    2: # Transliteration to ascii
    3: #
    4: # $Id: transliterate.pm,v 1.1 2019/02/26 14:42:22 raeburn Exp $
    5: #
    6: # Copyright Michigan State University Board of Trustees
    7: #
    8: # This file is part of the LearningOnline Network with CAPA (LON-CAPA).
    9: #
   10: # LON-CAPA is free software; you can redistribute it and/or modify
   11: # it under the terms of the GNU General Public License as published by
   12: # the Free Software Foundation; either version 2 of the License, or
   13: # (at your option) any later version.
   14: #
   15: # LON-CAPA is distributed in the hope that it will be useful,
   16: # but WITHOUT ANY WARRANTY; without even the implied warranty of
   17: # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   18: # GNU General Public License for more details.
   19: #
   20: # You should have received a copy of the GNU General Public License
   21: # along with LON-CAPA; if not, write to the Free Software
   22: # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   23: #
   24: # /home/httpd/html/adm/gpl.txt
   25: #
   26: # http://www.lon-capa.org/
   27: #
   28: ######################################################################
   29: ######################################################################
   30: 
   31: =pod
   32: 
   33: =head1 NAME
   34: 
   35: LONCAPA::transliterate - transliterate non-ascii characters
   36: in filenames.
   37: 
   38: =head1 SYNOPSIS
   39: 
   40: When fed a filename it will replace instances of non-ascii
   41: characters with transliterations.
   42: 
   43: =head1 OVERVIEW
   44: 
   45: Used to replace non-ascii character(s) with a transliteration
   46: of the character(s) to ascii character(s).
   47: 
   48: If there are preferred replacements for a particular language
   49: then those should be included in a separate subroutine which
   50: is called before the transliteration of last resort (which is
   51: done with Text::Unidecode).
   52: 
   53: =head1 SUBROUTINES
   54: 
   55: =cut
   56: 
   57: package LONCAPA::transliterate;
   58: 
   59: use strict;
   60: use utf8;
   61: use Text::Unidecode qw(unidecode);
   62: use Encode qw(decode_utf8 encode_utf8);
   63: 
   64: =pod
   65: 
   66: =over
   67: 
   68: =item * fname_to_ascii()
   69: 
   70: Inputs: $fname (required), $language (optional)
   71: 
   72: Output: $fname
   73: 
   74: Replaces non-ascii characters with a transliteration
   75: of each character to ascii character(s) (using Text::Unidecode).
   76: 
   77: If the language code is de, transliteration via 
   78: german_to_ascii() is used first to handle umlauts and eszett,
   79: before using Text::Unidecode.
   80: 
   81: If other routines are added to support preferred transliteration
   82: of non-ascii characters for specific languages, they should be
   83: added as new subroutines to this file, and then called if the
   84: language code has an appropriate value.
   85: 
   86: =back
   87: 
   88: =cut
   89: 
   # fname_to_ascii($fname, $language)
   #
   # Transliterates the non-ascii characters of a file name to ascii.
   #
   # Inputs:
   #   $fname    - (required) file name; it is passed through decode_utf8(),
   #               i.e., it is treated as UTF-8 encoded octets.
   #   $language - (optional) language code. If it is 'de', german_to_ascii()
   #               is applied first so that umlauts and eszett get their
   #               preferred German replacements.
   #
   # Output:
   #   $fname with the non-ascii characters replaced. A name which is
   #   undefined or contains only ascii characters is returned unchanged.
   sub fname_to_ascii {
       my ($fname,$language) = @_;

       # Nothing to do unless there is at least one non-ascii character.
       # The defined() test avoids an "uninitialized value" warning when
       # no file name was supplied.
       return $fname unless (defined($fname) && $fname =~ /[^\x{00}-\x{7f}]/);

       # Work on characters rather than octets, so that a multi-byte
       # sequence is transliterated as a single character.
       $fname = &decode_utf8($fname);

       # $language is optional, so check it is defined before comparing.
       if (defined($language) && $language eq 'de') {
           $fname = &german_to_ascii($fname);
       }

       # Transliteration of last resort for whatever is still non-ascii.
       $fname = unidecode($fname);

       # Hand back octets, as were received.
       return &encode_utf8($fname);
   }
  102: 
  103: =pod 
  104: 
  105: =over
  106: 
  107: =item * german_to_ascii()
  108: 
  109: Input: $fname (required)
  110: 
  111: Output: $fname
  112: 
  113: Replaces each letter with an umlaut by the same letter without the
  114: umlaut followed by the letter e, in matching case (Ä becomes AE, ä becomes ae).
  115: 
  116: Replaces eszett with double s.
  117: 
  118: =back
  119: 
  120: =cut
  121: 
  # german_to_ascii($fname)
  #
  # Replaces German umlauts and eszett with their customary ascii digraphs.
  #
  # Input:
  #   $fname - (required) file name as a string of characters (i.e., already
  #            decoded, as done by fname_to_ascii() before calling this).
  #
  # Output:
  #   $fname in which each letter with an umlaut has been replaced by the
  #   letter without the umlaut followed by e (upper case letters are
  #   followed by E), and eszett has been replaced by ss. All other
  #   characters are left untouched. An undefined $fname is returned as is.
  #
  # Both representations of a letter with an umlaut are handled:
  #   - precomposed: a single character, e.g., U+00E4;
  #   - decomposed: base letter followed by U+0308 (COMBINING DIAERESIS),
  #     which is what some clients send for file names.
  sub german_to_ascii {
      my ($fname) = @_;
      return $fname unless (defined($fname));

      # Precomposed characters. Written as escapes so that the mapping does
      # not depend on the encoding in which this source file is read.
      my %characters = (
                         "\x{C4}" => 'AE',   # A with umlaut
                         "\x{D6}" => 'OE',   # O with umlaut
                         "\x{DC}" => 'UE',   # U with umlaut
                         "\x{E4}" => 'ae',   # a with umlaut
                         "\x{F6}" => 'oe',   # o with umlaut
                         "\x{FC}" => 'ue',   # u with umlaut
                         "\x{DF}" => 'ss',   # eszett
                        );

      # Decomposed characters: keyed by the base letter which precedes
      # the combining diaeresis.
      my %decomposed = (
                         'A' => 'AE',
                         'O' => 'OE',
                         'U' => 'UE',
                         'a' => 'ae',
                         'o' => 'oe',
                         'u' => 'ue',
                        );

      # Decomposed forms first: the combining mark is consumed together
      # with its base letter so no stray diaeresis is left behind.
      $fname =~ s/([AOUaou])\x{308}/$decomposed{$1}/g;
      $fname =~ s/([\x{C4}\x{D6}\x{DC}\x{E4}\x{F6}\x{FC}\x{DF}])/$characters{$1}/g;
      return $fname;
  }
  136: 
  137: 1;

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>