Annotation of loncom/localize/localize/checksimilar_2files.pl, revision 1.8

1.1       wenzelju    1: #!/usr/bin/perl
                      2: # The LearningOnline Network with CAPA
1.8     ! bisitz      3: # $Id: checksimilar_2files.pl,v 1.7 2013/09/25 13:22:42 bisitz Exp $
1.1       wenzelju    4: 
                      5: use strict;
                      6: use warnings;
                      7: use utf8;
                      8: use open ':utf8';
                      9: 
                     10: ####
                     11: #### Checks, if there are similar keys in the two inputfiles.
                     12: #### For example, check the current lang.pm (first input) and newphrases.
                     13: #### So if there are similar keys you don't have to translate
                     14: #### them again but use the old value and just modify it.
                     15: #### IMPORTANT: Both inputfiles have to contain a hash %Lexicon (like lang.pm) !!!
                     16: 
                     17: 
                     18: ####--------Subroutines--------#### 
                     19: 
                     20: sub read {
                     21:     # Read file into memory
                     22:     my $file = shift;
1.6       bisitz     23:     open(IN,$file) or die "Error: Could not open file: $file\n";
1.1       wenzelju   24:     my %filecontent = ();
                     25:     my $contents = join('',<IN>);
                     26:     close(IN);
                     27:     # Build hash with hash from file
                     28:     my %Lexicon = ();
                     29:     eval($contents.'; %filecontent=%Lexicon;');
                     30:     if ($@ ne "") {
1.6       bisitz     31:         die "\nAn error occurred during the attempt to retrieve the translation hash.\n"
1.1       wenzelju   32:              ."Error: ".$@."\n";
                     33:     }
                     34:     return %filecontent;
                     35: }
                     36: 
1.4       bisitz     37: sub similar_chars {
1.1       wenzelju   38:     my $text = shift;
1.8     ! bisitz     39:     $text =~ s/<\/*\w+ *\/*>//g; # HTML tags
1.4       bisitz     40:     $text =~ s/\[_\d\]//g; # translation parameters
1.7       bisitz     41:     $text =~ s/[.,\_\-?!: \/\(\)]//g; # punctuation
1.1       wenzelju   42:     return $text;
                     43: }
                     44: 
                     45: 
                     46: 
1.4       bisitz     47: sub similar_phrases {
1.1       wenzelju   48:     
                     49:     my $text1 = shift;
                     50:     my $text2 = shift;
                     51:     
1.5       bisitz     52:     my %phrases = (
                     53:         'courses'     => 1,
                     54:         'communities' => 1,
                     55:         'course'      => 2,
                     56:         'community'   => 2,
                     57:         'member'      => 3,
                     58:         'student'     => 3,
                     59:         'students'    => 3,
                     60:         'construction'=> 4,
                     61:         'authoring'   => 4,
                     62:     );
                     63: 
                     64:     foreach my $word (keys %phrases) {
1.6       bisitz     65:         $text1 =~ s/$word/X$phrases{$word}X/gi;
                     66:         $text2 =~ s/$word/X$phrases{$word}X/gi;
1.5       bisitz     67:     }
1.1       wenzelju   68: 
1.4       bisitz     69:     if (lc($text1) eq lc($text2)) {
1.1       wenzelju   70:         return 1;
                     71:     }
                     72:     
                     73:     return 0;
                     74: }
                     75: 
                     76: 
                     77: 
                     78: ####--------Main Program--------####
                     79: 
1.6       bisitz     80: if (!$ARGV[0] or !$ARGV[1]) {
                     81:     die 'Error: Invalid files! Please specify two files which should be checked.'."\n";
                     82: }
                     83: 
1.1       wenzelju   84: my $file1 = $ARGV[0];  # Old language.pm
                     85: my $file2 = $ARGV[1];  # New Phrases
1.3       bisitz     86: 
                     87: print("Checking for similar expressions in phrases in $file1 and $file2...\n");
                     88: 
1.1       wenzelju   89: my %langOLD = &read($file1); #Hash with old phrases
                     90: my %langNEW = &read($file2); #Hash with new phrases
                     91: my $dlm; 
1.3       bisitz     92: my $count = 0;
1.1       wenzelju   93: 
                     94: # For each new phrase, check if there is already a similar one
                     95: while( my ($kNEW, $vNEW) = each %langNEW ) {
                     96:     my $temp1 = $kNEW;
1.4       bisitz     97:     $temp1 = &similar_chars($temp1);
1.1       wenzelju   98:    
                     99:     while( my ($kOLD, $vOLD) = each %langOLD ) {
                    100:         my $temp2 = $kOLD;
1.4       bisitz    101:         $temp2 = &similar_chars($temp2);
1.1       wenzelju  102: 
                    103:         #Check for similar punctuation (case insensitive) or
1.4       bisitz    104:         #similarity related to similar phrases 
                    105:         if (lc($temp1) eq lc($temp2) || &similar_phrases($temp1,$temp2)) {
1.1       wenzelju  106:             #Find delimiter for key and value
                    107:             if (($kNEW=~/\'/) & ($kNEW=~/\"/)) {
                    108:                 print " (Warning: Both, ' and \", occur!)";
                    109:             }
                    110:             if ($kNEW=~/\'/) {
                    111: 	        $dlm = '"';
                    112: 	    } else {
                    113: 	        $dlm = "'";
                    114: 	    }
1.3       bisitz    115:             print (<<ENDNEW);
                    116: #   $kOLD #(Old key)
1.1       wenzelju  117:    $dlm$kNEW$dlm
                    118: => $dlm$vOLD$dlm,
                    119: 
                    120: ENDNEW
                    121:             $count++;
                    122: 
                    123:         }
                    124:     }
                    125: }
                    126: print("Finished. ".$count." similar expressions found!\n");
                    127: 
                    128: 

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>