Speech:Summer 2012 pruneDictionary.pl


 * Home
 * Information

pruneDictionary Perl Script
This script works with the dictionary.pl script to create a new dictionary file from the master dictionary that contains only the entries found in the transcript.

Source Code

 * 1) !/usr/bin/perl

if ($#ARGV !=2) { print "Usage: pruneDictionary.pl  \n"; exit -1; }

# Command-line arguments, in order: the transcript file, the master
# dictionary, and the path of the pruned dictionary to produce.
($trans_file, $dict, $output_file) = @ARGV[0 .. 2];

# Names of the two scratch files; they are created relative to the
# current working directory.
($temp_file, $temp_pruned) = ("temp", "pruned");

# Run text2wfreq over the transcript to get the list of distinct words with
# their occurrence counts, sort it, and drop every line containing "(sw"
# (the "(swxxx)" markers from the transcript).  The surviving lines are
# written to $temp_file.
#
# The file names are single-quoted for the shell so that paths containing
# spaces or shell metacharacters are passed through literally.
(my $quoted_trans = $trans_file) =~ s/'/'\\''/g;
(my $quoted_temp  = $temp_file)  =~ s/'/'\\''/g;
$sysCmd = "text2wfreq < '$quoted_trans' | sort | grep -v '(sw' > '$quoted_temp'";

my $wfreq_status = system($sysCmd);
die "pruneDictionary.pl: could not run shell pipeline: $!\n"
    if $wfreq_status == -1;
# The pipeline's status is grep's: 1 only means "no lines kept"; anything
# above 1 is a real grep error and is worth reporting.
warn "pruneDictionary.pl: word-list pipeline exited with status "
    . ($wfreq_status >> 8) . "\n"
    if ($wfreq_status >> 8) > 1;

# Open the word-count list for reading and the pruned word list for writing.
# Three-argument open keeps odd characters in the names from being read as a
# mode, and the output is truncated (not appended) so a stale file left by
# an earlier failed run cannot leak old words into this run.
open(MYINPUTFILE, '<', $temp_file)
    or die "pruneDictionary.pl: cannot open '$temp_file' for reading: $!\n";
open(MYOUTPUTFILE, '>', $temp_pruned)
    or die "pruneDictionary.pl: cannot open '$temp_pruned' for writing: $!\n";

# For each "word count" line of the temporary word list, strip the trailing
# occurrence count and surrounding white space, then save the bare word in
# the temporary pruned file.  Only the count at the end of the line is
# removed, so digits that are part of a word are preserved.
while (defined($line = <MYINPUTFILE>)) {
    chomp $line;
    $line =~ s/\s+[0-9]+\s*$//;    # drop the trailing count
    $line =~ s/^\s+//;             # drop any leading white space
    print MYOUTPUTFILE "$line\n";
}

# Close both handles now so the pruned list is completely on disk before
# another program reads it.
close(MYINPUTFILE);
close(MYOUTPUTFILE)
    or die "pruneDictionary.pl: error writing pruned word list: $!\n";

# Call the dictionary script, which creates a new dictionary ($output_file)
# containing only the words in the pruned list.  The list form of system()
# bypasses the shell, so file names with spaces or metacharacters are passed
# through unchanged.  A failure is reported but not fatal, so that the rest
# of the script still runs.
my @dictionary_cmd = (
    "/mnt/main/scripts/train/scripts_pl/dictionary.pl",
    $temp_pruned, $dict, $output_file,
);
$sysCmd = join(' ', @dictionary_cmd);    # kept for reference/debugging
if (system(@dictionary_cmd) != 0) {
    warn "pruneDictionary.pl: '$sysCmd' failed"
        . ($? == -1 ? ": $!" : " with status " . ($? >> 8)) . "\n";
}

# Remove the temporary files.  unlink() does this without starting a shell,
# so unusual characters in the names are harmless; each failure is reported
# individually.
for my $scratch ($temp_pruned, $temp_file) {
    unlink($scratch)
        or warn "pruneDictionary.pl: cannot remove '$scratch': $!\n";
}