loncom/localize/localize/checksimilar_2files.pl - view

File: [LON-CAPA] / loncom / localize / localize / checksimilar_2files.pl
Revision 1.2: download - view: text, annotated - select for diffs
Tue Aug 3 13:00:21 2010 UTC (13 years, 9 months ago) by bisitz
Branches: MAIN
CVS tags: language_hyphenation_merge, language_hyphenation, HEAD, BZ4492-merge, BZ4492-feature_horizontal_radioresponse

Optimized detection of similar phrases.

1: #!/usr/bin/perl 2: # The LearningOnline Network with CAPA 3: # $Id: checksimilar_2files.pl,v 1.2 2010/08/03 13:00:21 bisitz Exp $ 4: 5: use strict; 6: use warnings; 7: use utf8; 8: use open ':utf8'; 9: 10: #### 11: #### Checks, if there are similar keys in the two inputfiles. 12: #### For example, check the current lang.pm (first input) and newphrases. 13: #### So if there are similar keys you don't have to translate 14: #### them again but use the old value and just modify it. 15: #### IMPORTANT: Both inputfiles have to contain a hash %Lexicon (like lang.pm) !!! 16: 17: 18: ####--------Subroutines--------#### 19: 20: sub read { 21: # Read file into memory 22: my $file = shift; 23: open(IN,$file) or die; 24: my %filecontent = (); 25: my $contents = join('',<IN>); 26: close(IN); 27: # Build hash with hash from file 28: my %Lexicon = (); 29: eval($contents.'; %filecontent=%Lexicon;'); 30: if ($@ ne "") { 31: print "\nAn error occurred during the attempt to retrieve the translation hash.\n" 32: ."Error: ".$@."\n"; 33: die; 34: } 35: return %filecontent; 36: } 37: 38: sub similarities{ 39: my $text = shift; 40: $text =~ s/[.,\_\-?!:]//g; 41: return $text; 42: } 43: 44: 45: 46: sub CourseCommunity { 47: 48: my $text1 = shift; 49: my $text2 = shift; 50: 51: $text1 =~ s/courses/X001X/gi; 52: $text1 =~ s/communities/X001X/gi; 53: $text1 =~ s/course/X002X/gi; 54: $text1 =~ s/community/X002X/gi; 55: $text2 =~ s/courses/X001X/gi; 56: $text2 =~ s/communities/X001X/gi; 57: $text2 =~ s/course/X002X/gi; 58: $text2 =~ s/community/X002X/gi; 59: 60: if(lc($text1) eq lc($text2)) { 61: return 1; 62: } 63: 64: return 0; 65: } 66: 67: 68: 69: ####--------Main Program--------#### 70: 71: my $file1 = $ARGV[0]; # Old language.pm 72: my $file2 = $ARGV[1]; # New Phrases 73: my %langOLD = &read($file1); #Hash with old phrases 74: my %langNEW = &read($file2); #Hash with new phrases 75: my $dlm; 76: my $count = 1; #Counter 77: 78: open(OUT,'>similarities.txt') or die; 79: 80: # For each new phrase, check if there is already a similar one 81: while( my ($kNEW, $vNEW) = each %langNEW ) { 82: my $temp1 = $kNEW; 83: $temp1 = &similarities($temp1); 84: 85: while( my ($kOLD, $vOLD) = each %langOLD ) { 86: my $temp2 = $kOLD; 87: $temp2 = &similarities($temp2); 88: 89: #Check for similar punctuation (case insensitive) or 90: #similarity related to Course/Community 91: if(lc($temp1) eq lc($temp2) || &CourseCommunity($temp1,$temp2)){ 92: #Find delimiter for key and value 93: if (($kNEW=~/\'/) & ($kNEW=~/\"/)) { 94: print " (Warning: Both, ' and \", occur!)"; 95: } 96: if ($kNEW=~/\'/) { 97: $dlm = '"'; 98: } else { 99: $dlm = "'"; 100: } 101: print OUT (<<ENDNEW); 102: #Old key: $kOLD 103: $dlm$kNEW$dlm 104: => $dlm$vOLD$dlm, 105: 106: ENDNEW 107: $count++; 108: 109: } 110: } 111: } 112: print("Finished. ".$count." similar expressions found!\n"); 113: 114: