File:  [LON-CAPA] / loncom / localize / localize / checksimilar_1file.pl
Revision 1.2: download - view: text, annotated - select for diffs
Mon Jan 21 15:29:30 2013 UTC (11 years, 3 months ago) by bisitz
Branches: MAIN
CVS tags: HEAD
- Detect even more similarities by extending list of similar characters
  (include improvements from checksimilar_2files.pl 1.4)
- Offer switch to include check for similar phrases, e.g. course/community, as done in checksimilar_2files.pl

#!/usr/bin/perl
# The LearningOnline Network with CAPA
# $Id: checksimilar_1file.pl,v 1.2 2013/01/21 15:29:30 bisitz Exp $

use strict;
use warnings;

####
#### Checks whether the input file (for example de.pm) contains keys that are similar to each other
####

####--------Configuration--------####
# Include check for similar phrases -> set to 1
my $inclphrases = 0;


####--------Subroutines--------####



sub read {
    # Load a lexicon file and return its %Lexicon hash as a flat key/value list.
    #
    # Parameter: $fn - path of the file to load (for example de.pm). The file
    #                  is expected to contain Perl code that assigns to %Lexicon.
    # Returns:   the contents of %Lexicon after evaluating the file.
    # Dies:      if no file name is given, the file cannot be opened, or the
    #            file content fails to evaluate.
    #
    # NOTE: This sub shares its name with the Perl builtin read(), so it has to
    #       be called as &read(...).
    # NOTE(security): the file content is executed as Perl code via string
    #       eval. Only use this on trusted files.
    my $fn = shift;
    die "No input file name given\n" unless defined($fn) && length($fn);

    # Three-argument open with a lexical handle: the file name can never be
    # misinterpreted as an open mode, and the handle is not a global.
    open(my $in, '<', $fn) or die "Cannot open '$fn' for reading: $!\n";
    my $contents = join('', <$in>);
    close($in);

    # The evaluated code assigns to this lexical %Lexicon.
    my %Lexicon = ();
    eval($contents);
    # Without this check a broken file would silently produce an empty hash.
    die "Cannot evaluate '$fn': $@" if $@;

    # Return %Lexicon directly instead of appending a copy statement to the
    # evaluated text: an appended statement is lost when the file ends in a
    # comment without a trailing newline or contains an __END__ marker.
    return %Lexicon;
}

sub similarities {
   # Reduce a lexicon key to a normalised form, so that keys which differ only
   # in translation parameters, punctuation or (optionally) interchangeable
   # phrases end up as the same string.
   #
   # Parameter: $text - the key to normalise
   # Returns:   the normalised key (case is NOT changed here)
   my $text = shift;
   # Translation parameters; \d+ also covers multi-digit ones such as [_10]
   $text =~ s/\[_\d+\]//g;
   $text =~ s/[.,\_\-?!: \/]//g; # punctuation
   # $inclphrases is the file-level configuration switch
   if ($inclphrases) {
       $text =~ s/course/X002X/gi;
       $text =~ s/community/X002X/gi;
       # NOTE(review): the plural gets a different placeholder (X001X) than
       # the singular (X002X) - confirm whether this is intended.
       $text =~ s/communities/X001X/gi;
       # Singular and plural in one step. A separate "students" rule placed
       # after the "student" rule can never match, because "students" has
       # already become "X003Xs" by then.
       $text =~ s/(?:member|student)s?/X003X/gi;
   }
   return $text;
}




####--------Main program--------####


# Input file (for example de.pm) is given as the first command line argument
my $file = $ARGV[0];
die "Usage: $0 <inputfile>\n" unless defined($file);

# &read (with ampersand) calls the sub defined in this file; a plain read(...)
# would be parsed as the Perl builtin.
my %lang = &read($file);
my $count = 0;

# Group all keys by their normalised, lowercased form. Every key is
# normalised exactly once, instead of once per compared pair of keys.
# Keys are sorted so that the output is the same on every run.
my %group;
foreach my $key (sort(keys(%lang))) {
   my $normalised = lc(&similarities($key));
   push(@{$group{$normalised}}, $key);
}

# Every group with more than one key is a set of similar keys.
# Print each further key of a group together with the group's first key,
# so a group of n similar keys yields n-1 lines.
foreach my $normalised (sort(keys(%group))) {
   my ($first, @others) = @{$group{$normalised}};
   foreach my $other (@others) {
      print('###'.$first."###".$other."###\n");
      $count++;
   }
}
print("Finished. ".$count." similar keys found.\n");


FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>