File:  [LON-CAPA] / loncom / localize / localize / checksimilar_2files.pl
Revision 1.2: download - view: text, annotated - select for diffs
Tue Aug 3 13:00:21 2010 UTC (13 years, 9 months ago) by bisitz
Branches: MAIN
CVS tags: language_hyphenation_merge, language_hyphenation, HEAD, BZ4492-merge, BZ4492-feature_horizontal_radioresponse
Optimized detection of similar phrases.

    1: #!/usr/bin/perl
    2: # The LearningOnline Network with CAPA
    3: # $Id: checksimilar_2files.pl,v 1.2 2010/08/03 13:00:21 bisitz Exp $
    4: 
    5: use strict;
    6: use warnings;
    7: use utf8;
    8: use open ':utf8';
    9: 
   10: ####
   11: #### Checks, if there are similar keys in the two inputfiles.
   12: #### For example, check the current lang.pm (first input) and newphrases.
   13: #### So if there are similar keys you don't have to translate
   14: #### them again but use the old value and just modify it.
   15: #### IMPORTANT: Both inputfiles have to contain a hash %Lexicon (like lang.pm) !!!
   16: 
   17: 
   18: ####--------Subroutines--------#### 
   19: 
   20: sub read {
   21:     # Read file into memory
   22:     my $file = shift;
   23:     open(IN,$file) or die;
   24:     my %filecontent = ();
   25:     my $contents = join('',<IN>);
   26:     close(IN);
   27:     # Build hash with hash from file
   28:     my %Lexicon = ();
   29:     eval($contents.'; %filecontent=%Lexicon;');
   30:     if ($@ ne "") {
   31:         print "\nAn error occurred during the attempt to retrieve the translation hash.\n"
   32:              ."Error: ".$@."\n";
   33:         die;
   34:     }
   35:     return %filecontent;
   36: }
   37: 
   38: sub similarities{
   39:     my $text = shift;
   40:     $text =~ s/[.,\_\-?!:]//g;
   41:     return $text;
   42: }
   43: 
   44: 
   45: 
   46: sub CourseCommunity {
   47:     
   48:     my $text1 = shift;
   49:     my $text2 = shift;
   50:     
   51:     $text1 =~ s/courses/X001X/gi;
   52:     $text1 =~ s/communities/X001X/gi;    
   53:     $text1 =~ s/course/X002X/gi;
   54:     $text1 =~ s/community/X002X/gi;
   55:     $text2 =~ s/courses/X001X/gi;
   56:     $text2 =~ s/communities/X001X/gi;
   57:     $text2 =~ s/course/X002X/gi;
   58:     $text2 =~ s/community/X002X/gi;
   59: 
   60:     if(lc($text1) eq lc($text2)) {
   61:         return 1;
   62:     }
   63:     
   64:     return 0;
   65: }
   66: 
   67: 
   68: 
   69: ####--------Main Program--------####
   70: 
   71: my $file1 = $ARGV[0];  # Old language.pm
   72: my $file2 = $ARGV[1];  # New Phrases
   73: my %langOLD = &read($file1); #Hash with old phrases
   74: my %langNEW = &read($file2); #Hash with new phrases
   75: my $dlm; 
   76: my $count = 1; #Counter
   77: 
   78: open(OUT,'>similarities.txt') or die;
   79: 
   80: # For each new phrase, check if there is already a similar one
   81: while( my ($kNEW, $vNEW) = each %langNEW ) {
   82:     my $temp1 = $kNEW;
   83:     $temp1 = &similarities($temp1);
   84:    
   85:     while( my ($kOLD, $vOLD) = each %langOLD ) {
   86:         my $temp2 = $kOLD;
   87:         $temp2 = &similarities($temp2);
   88: 
   89:         #Check for similar punctuation (case insensitive) or
   90:         #similarity related to Course/Community 
   91:         if(lc($temp1) eq lc($temp2) || &CourseCommunity($temp1,$temp2)){
   92:             #Find delimiter for key and value
   93:             if (($kNEW=~/\'/) & ($kNEW=~/\"/)) {
   94:                 print " (Warning: Both, ' and \", occur!)";
   95:             }
   96:             if ($kNEW=~/\'/) {
   97: 	        $dlm = '"';
   98: 	    } else {
   99: 	        $dlm = "'";
  100: 	    }
  101:             print OUT (<<ENDNEW);
  102: #Old key: $kOLD
  103:    $dlm$kNEW$dlm
  104: => $dlm$vOLD$dlm,
  105: 
  106: ENDNEW
  107:             $count++;
  108: 
  109:         }
  110:     }
  111: }
  112: print("Finished. ".$count." similar expressions found!\n");
  113: 
  114: 

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>