Speech:PruneDictionary4.pl

Summary
Title: pruneDictionary4.pl

Author: David Meehan

Location: /mnt/main/scripts/user/PruneDictionary4.pl

Usage: pruneDictionary4.pl <transcript file>

Description
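This is a complete rebuild of the pruneDictionary script which significantly improves the performance of generating a dictionary file. What once could take hours on larger data sets now takes seconds. Given a transcript file, the script looks up every word of the transcript in the master Switchboard dictionary, writes the entries it finds to train.dic, and lists the words it could not find in add.txt.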

Code

#!/usr/bin/perl

use strict;
use warnings;

use Cwd 'abs_path';
use File::Basename;

# pruneDictionary4.pl
#
# Builds a pruned pronunciation dictionary for one transcript file.
#
# Input:
#   $ARGV[0]   transcript file; every whitespace-separated token is treated
#              as a candidate word.
#   $dict      master dictionary, one entry per line in the form
#              "WORD<space>PHONEMES".
#
# Output (written to the current directory):
#   train.dic  sorted "WORD PHONEMES" lines for every transcript word that
#              has an entry in the master dictionary.
#   add.txt    one line per transcript word with no dictionary entry.
#
# Exits with status -1 (after printing a usage line) unless exactly one
# argument is given; dies if any file cannot be opened or written.

if (@ARGV != 1) {
    print "Usage: pruneDictionary4.pl <transcript_file>\n";
    exit -1;
}

my $dict  = "/mnt/main/corpus/switchboard/dist/dict/custom/switchboard.dic";
my $trans = $ARGV[0];

# Maps each transcript word to its pronunciation. The value stays undef
# until (and unless) the word is found in the dictionary.
my %phonemes_for;

# Collect the unique words of the transcript. This is done in Perl rather
# than through a shell pipeline so that a transcript path containing spaces
# or shell metacharacters cannot break (or inject into) a command line.
open(my $trans_fh, '<', $trans)
    or die "Could not open transcript file '$trans': $!\n";
while (my $line = <$trans_fh>) {
    # split ' ' discards leading/trailing whitespace, so blank lines and
    # padded lines never yield an empty "word".
    for my $token (split ' ', $line) {
        # Skip any token containing "sw" -- presumably this strips the
        # Switchboard utterance IDs.
        # NOTE(review): this also drops ordinary lower-case words that
        # contain "sw"; confirm the transcripts are upper-case.
        next if index($token, 'sw') >= 0;

        # Skip markup tokens, i.e. anything containing "<".
        next if index($token, '<') >= 0;

        $phonemes_for{$token} = undef;
    }
}
close($trans_fh);

my $size = keys %phonemes_for;
print "Processing $size words against dictionary...\n";

# Single pass over the master dictionary, keeping only the entries whose
# word occurs in the transcript. If a word is listed more than once the
# last entry wins.
open(my $dict_fh, '<', $dict)
    or die "Error opening dictionary '$dict': $!\n";
while (my $line = <$dict_fh>) {
    # Strip the newline here and add it back on output, so that a final
    # dictionary line without a trailing newline cannot fuse two entries.
    chomp $line;

    # The word is everything before the first space, the pronunciation is
    # everything after it.
    my ($word, $phonemes) = split / /, $line, 2;

    # A line with no space has no pronunciation; ignore it.
    next unless defined $phonemes;
    next unless exists $phonemes_for{$word};

    $phonemes_for{$word} = $phonemes;
}
close($dict_fh);

# Opening with '>' truncates, so a stale add.txt needs no explicit removal.
open(my $add_fh, '>', 'add.txt')
    or die "Could not open add.txt for writing: $!\n";
open(my $new_fh, '>', 'new.dic')
    or die "Could not open new.dic for writing: $!\n";

my $missing = 0;
# Iterate in sorted order so that add.txt is deterministic between runs.
for my $word (sort keys %phonemes_for) {
    my $phonemes = $phonemes_for{$word};
    if (defined $phonemes) {
        print {$new_fh} "$word $phonemes\n";
    }
    else {
        print {$add_fh} "$word\n";
        $missing++;
    }
}

# Buffered write errors only surface at close, so check both.
close($add_fh) or die "Error writing add.txt: $!\n";
close($new_fh) or die "Error writing new.dic: $!\n";

# The final ordering is still delegated to the system sort so that train.dic
# keeps the same (locale-dependent) collation as before. List-form system()
# bypasses the shell.
system('sort', '-o', 'train.dic', 'new.dic') == 0
    or die "Sorting new.dic into train.dic failed (status $?)\n";
unlink('new.dic') or warn "Could not remove new.dic: $!\n";

print "--\nAdded $missing words to add.txt\nCreated train.dic\nDone!\n";