File:  [LON-CAPA] / loncom / localize / localize / checksimilar_2files.pl
Revision 1.8: download - view: text, annotated - select for diffs
Fri Dec 20 14:27:54 2013 UTC (10 years, 4 months ago) by bisitz
Branches: MAIN
CVS tags: HEAD
Also ignore HTML tags

#!/usr/bin/perl
# The LearningOnline Network with CAPA
# $Id: checksimilar_2files.pl,v 1.8 2013/12/20 14:27:54 bisitz Exp $

use strict;
use warnings;
use utf8;
use open ':utf8';

####
#### Checks, if there are similar keys in the two inputfiles.
#### For example, check the current lang.pm (first input) and newphrases.
#### So if there are similar keys you don't have to translate
#### them again but use the old value and just modify it.
#### IMPORTANT: Both inputfiles have to contain a hash %Lexicon (like lang.pm) !!!


####--------Subroutines--------#### 

sub read {
    # Read file into memory
    my $file = shift;
    open(IN,$file) or die "Error: Could not open file: $file\n";
    my %filecontent = ();
    my $contents = join('',<IN>);
    close(IN);
    # Build hash with hash from file
    my %Lexicon = ();
    eval($contents.'; %filecontent=%Lexicon;');
    if ($@ ne "") {
        die "\nAn error occurred during the attempt to retrieve the translation hash.\n"
             ."Error: ".$@."\n";
    }
    return %filecontent;
}

sub similar_chars {
    my $text = shift;
    $text =~ s/<\/*\w+ *\/*>//g; # HTML tags
    $text =~ s/\[_\d\]//g; # translation parameters
    $text =~ s/[.,\_\-?!: \/\(\)]//g; # punctuation
    return $text;
}



sub similar_phrases {
    
    my $text1 = shift;
    my $text2 = shift;
    
    my %phrases = (
        'courses'     => 1,
        'communities' => 1,
        'course'      => 2,
        'community'   => 2,
        'member'      => 3,
        'student'     => 3,
        'students'    => 3,
        'construction'=> 4,
        'authoring'   => 4,
    );

    foreach my $word (keys %phrases) {
        $text1 =~ s/$word/X$phrases{$word}X/gi;
        $text2 =~ s/$word/X$phrases{$word}X/gi;
    }

    if (lc($text1) eq lc($text2)) {
        return 1;
    }
    
    return 0;
}



####--------Main Program--------####

if (!$ARGV[0] or !$ARGV[1]) {
    die 'Error: Invalid files! Please specify two files which should be checked.'."\n";
}

my $file1 = $ARGV[0];  # Old language.pm
my $file2 = $ARGV[1];  # New Phrases

print("Checking for similar expressions in phrases in $file1 and $file2...\n");

my %langOLD = &read($file1); #Hash with old phrases
my %langNEW = &read($file2); #Hash with new phrases
my $dlm; 
my $count = 0;

# For each new phrase, check if there is already a similar one
while( my ($kNEW, $vNEW) = each %langNEW ) {
    my $temp1 = $kNEW;
    $temp1 = &similar_chars($temp1);
   
    while( my ($kOLD, $vOLD) = each %langOLD ) {
        my $temp2 = $kOLD;
        $temp2 = &similar_chars($temp2);

        #Check for similar punctuation (case insensitive) or
        #similarity related to similar phrases 
        if (lc($temp1) eq lc($temp2) || &similar_phrases($temp1,$temp2)) {
            #Find delimiter for key and value
            if (($kNEW=~/\'/) & ($kNEW=~/\"/)) {
                print " (Warning: Both, ' and \", occur!)";
            }
            if ($kNEW=~/\'/) {
	        $dlm = '"';
	    } else {
	        $dlm = "'";
	    }
            print (<<ENDNEW);
#   $kOLD #(Old key)
   $dlm$kNEW$dlm
=> $dlm$vOLD$dlm,

ENDNEW
            $count++;

        }
    }
}
print("Finished. ".$count." similar expressions found!\n");



FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>