Diff for /loncom/localize/localize/checksimilar_2files.pl between versions 1.1 and 1.8

version 1.1, 2010/03/09 15:16:26 version 1.8, 2013/12/20 14:27:54
Line 20  use open ':utf8'; Line 20  use open ':utf8';
 sub read {  sub read {
     # Read file into memory      # Read file into memory
     my $file = shift;      my $file = shift;
     open(IN,$file) or die;      open(IN,$file) or die "Error: Could not open file: $file\n";
     my %filecontent = ();      my %filecontent = ();
     my $contents = join('',<IN>);      my $contents = join('',<IN>);
     close(IN);      close(IN);
Line 28  sub read { Line 28  sub read {
     my %Lexicon = ();      my %Lexicon = ();
     eval($contents.'; %filecontent=%Lexicon;');      eval($contents.'; %filecontent=%Lexicon;');
     if ($@ ne "") {      if ($@ ne "") {
         print "\nAn error occurred during the attempt to retrieve the translation hash.\n"          die "\nAn error occurred during the attempt to retrieve the translation hash.\n"
              ."Error: ".$@."\n";               ."Error: ".$@."\n";
         die;  
     }      }
     return %filecontent;      return %filecontent;
 }  }
   
 sub similarities{  sub similar_chars {
     my $text = shift;      my $text = shift;
     $text =~ s/[.,\_\-?!:]//g;      $text =~ s/<\/*\w+ *\/*>//g; # HTML tags
       $text =~ s/\[_\d\]//g; # translation parameters
       $text =~ s/[.,\_\-?!: \/\(\)]//g; # punctuation
     return $text;      return $text;
 }  }
   
   
   
 sub CourseCommunity {  sub similar_phrases {
           
     my $text1 = shift;      my $text1 = shift;
     my $text2 = shift;      my $text2 = shift;
           
     $text1 =~ s/courses//gi;      my %phrases = (
     $text1 =~ s/communities//gi;              'courses'     => 1,
     $text1 =~ s/course//gi;          'communities' => 1,
     $text1 =~ s/community//gi;          'course'      => 2,
     $text2 =~ s/courses//gi;          'community'   => 2,
     $text2 =~ s/communities//gi;          'member'      => 3,
     $text2 =~ s/course//gi;          'student'     => 3,
     $text2 =~ s/community//gi;          'students'    => 3,
           'construction'=> 4,
           'authoring'   => 4,
       );
   
       foreach my $word (keys %phrases) {
           $text1 =~ s/$word/X$phrases{$word}X/gi;
           $text2 =~ s/$word/X$phrases{$word}X/gi;
       }
   
     if(lc($text1) eq lc($text2)) {      if (lc($text1) eq lc($text2)) {
         return 1;          return 1;
     }      }
           
Line 68  sub CourseCommunity { Line 77  sub CourseCommunity {
   
 ####--------Main Program--------####  ####--------Main Program--------####
   
   if (!$ARGV[0] or !$ARGV[1]) {
       die 'Error: Invalid files! Please specify two files which should be checked.'."\n";
   }
   
 my $file1 = $ARGV[0];  # Old language.pm  my $file1 = $ARGV[0];  # Old language.pm
 my $file2 = $ARGV[1];  # New Phrases  my $file2 = $ARGV[1];  # New Phrases
   
   print("Checking for similar expressions in phrases in $file1 and $file2...\n");
   
 my %langOLD = &read($file1); #Hash with old phrases  my %langOLD = &read($file1); #Hash with old phrases
 my %langNEW = &read($file2); #Hash with new phrases  my %langNEW = &read($file2); #Hash with new phrases
 my $dlm;   my $dlm; 
 my $count = 1; #Counter  my $count = 0;
   
 open(OUT,'>similarities.txt') or die;  
   
 # For each new phrase, check if there is already a similar one  # For each new phrase, check if there is already a similar one
 while( my ($kNEW, $vNEW) = each %langNEW ) {  while( my ($kNEW, $vNEW) = each %langNEW ) {
     my $temp1 = $kNEW;      my $temp1 = $kNEW;
     $temp1 = &similarities($temp1);      $temp1 = &similar_chars($temp1);
         
     while( my ($kOLD, $vOLD) = each %langOLD ) {      while( my ($kOLD, $vOLD) = each %langOLD ) {
         my $temp2 = $kOLD;          my $temp2 = $kOLD;
         $temp2 = &similarities($temp2);          $temp2 = &similar_chars($temp2);
   
         #Check for similar punctuation (case insensitive) or          #Check for similar punctuation (case insensitive) or
         #similarity related to Course/Community           #similarity related to similar phrases 
         if(lc($temp1) eq lc($temp2) || &CourseCommunity($temp1,$temp2)){          if (lc($temp1) eq lc($temp2) || &similar_phrases($temp1,$temp2)) {
             #Find delimiter for key and value              #Find delimiter for key and value
             if (($kNEW=~/\'/) & ($kNEW=~/\"/)) {              if (($kNEW=~/\'/) & ($kNEW=~/\"/)) {
                 print " (Warning: Both, ' and \", occur!)";                  print " (Warning: Both, ' and \", occur!)";
Line 98  while( my ($kNEW, $vNEW) = each %langNEW Line 112  while( my ($kNEW, $vNEW) = each %langNEW
     } else {      } else {
         $dlm = "'";          $dlm = "'";
     }      }
             print OUT (<<ENDNEW);              print (<<ENDNEW);
 #Old key: $kOLD  #   $kOLD #(Old key)
    $dlm$kNEW$dlm     $dlm$kNEW$dlm
 => $dlm$vOLD$dlm,  => $dlm$vOLD$dlm,
   

Removed from v.1.1  
changed lines
  Added in v.1.8


FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>