sts

Sleeping

App Files Files Community

sts / uroman /bin /string-distance.pl

multimodalart HF staff

First commit

7bcf8d7 over 1 year ago

raw

history blame

3.22 kB

	#!/usr/bin/perl -w

	# Author: Ulf Hermjakob
	# Release date: October 13, 2019

	# Usage: string-distance.pl {-lc1 <language-code>} {-lc2 <language-code>} < STDIN > STDOUT
	# Example: string-distance.pl -lc1 rus -lc2 ukr < STDIN > STDOUT
	# Example: string-distance.pl < ../test/string-similarity-test-input.txt
	# Input format: two strings per line (tab-separated, in Latin script)
	# Strings in non-Latin scripts should first be romanized. (Recommended script: uroman.pl)
	# Output format: repetition of the two input strings, plus the string distance between them (tab-separated).
	# Additional output meta info lines at the top are marked with an initial #.
	#
	# The script uses data from a string-distance-cost-rules file that lists costs,
	# where the default cost is "1" with lower costs for differences in vowels,
	# duplicate consonants, "f" vs. "ph" etc.
	# Language cost rules can be language-specific and context-sensitive.

	$\|=1;

	use FindBin;
	use Cwd "abs_path";
	use File::Basename qw(dirname);
	use File::Spec;

	my $bin_dir = abs_path(dirname($0));
	my $root_dir = File::Spec->catfile($bin_dir, File::Spec->updir());
	my $data_dir = File::Spec->catfile($root_dir, "data");
	my $lib_dir = File::Spec->catfile($root_dir, "lib");

	use lib "$FindBin::Bin/../lib";
	use List::Util qw(min max);
	use NLP::utilities;
	use NLP::stringDistance;
	$util = NLP::utilities;
	$sd = NLP::stringDistance;
	$verbose = 0;
	$separator = "\t";

	$cost_rule_filename = File::Spec->catfile($data_dir, "string-distance-cost-rules.txt");

	$lang_code1 = "eng";
	$lang_code2 = "eng";
	%ht = ();

	while (@ARGV) {
	$arg = shift @ARGV;
	if ($arg =~ /^-+lc1$/) {
	$lang_code_candidate = shift @ARGV;
	$lang_code1 = $lang_code_candidate if $lang_code_candidate =~ /^[a-z]{3,3}$/;
	} elsif ($arg =~ /^-+lc2$/) {
	$lang_code_candidate = shift @ARGV;
	$lang_code2 = $lang_code_candidate if $lang_code_candidate =~ /^[a-z]{3,3}$/;
	} elsif ($arg =~ /^-+(v\|verbose)$/) {
	$verbose = shift @ARGV;
	} else {
	print STDERR "Ignoring unrecognized arg $arg\n";
	}
	}

	$sd->load_string_distance_data($cost_rule_filename, *ht, $verbose);
	print STDERR "Loaded resources.\n" if $verbose;

	my $chart_id = 0;
	my $line_number = 0;
	print "# Lang-code-1: $lang_code1 Lang-code-2: $lang_code2\n";
	while (<>) {
	$line_number++;
	if ($verbose) {
	if ($line_number =~ /000$/) {
	if ($line_number =~ /0000$/) {
	print STDERR $line_number;
	} else {
	print STDERR ".";
	}
	}
	}
	my $line = $_;
	$line =~ s/^\xEF\xBB\xBF//;
	next if $line =~ /^\s(\#.)?$/;
	my $s1;
	my $s2;
	if (($s1, $s2) = ($line =~ /^("(?:\\"\|[^"])"\|\S+)$separator("(?:\\"\|[^"])"\|\S+)\s*$/)) {
	$s1 = $util->dequote_string($s1);
	$s2 = $util->dequote_string($s2);
	} elsif ($line =~ /^\s(#.)$/) {
	} else {
	print STDERR "Could not process line $line_number: $line" if $verbose;
	print "\n";
	next;
	}

	$cost = $sd->quick_romanized_string_distance_by_chart($s1, $s2, *ht, "", $lang_code1, $lang_code2);
	print "$s1\t$s2\t$cost\n";
	}
	print STDERR "\n" if $verbose;

	exit 0;