PrepareExperiment2.pl

Summary
Title: prepareExperiment2.pl

Author: Ramon Whitman

Location: /mnt/main/scripts/user/prepareExperiment2.pl

Usage: /mnt/main/scripts/user/prepareExperiment2.pl <corpus>/<data set> (ex. first_5hr/train)

Description
This script automates the entire training process up to feats generation (it does not update the senone value, density, or other trainer parameters). It relies on symlinks to the audio files to reduce the amount of disk space used. It also makes use of the newest versions of genTrans and pruneDictionary, which drastically improve the performance of generating the transcript and dictionary files.

Code
use Cwd; use File::Basename;
#!/usr/bin/perl

# Require exactly one argument: the corpus and data set to train on, given as
# <corpus>/<data set> (for example 10hr/train).  Otherwise print the usage text
# and stop.
#
# The usage text previously named a different script (buildData.pl) and had lost
# its argument placeholders; both are corrected here.  The output stream and the
# exit status (-1, which the shell sees as 255) are unchanged.
if (@ARGV != 1) {
    print "\nusage (run from main experiment directory): prepareExperiment2.pl <corpus>/<data set>\n";
    print "    <corpus>/<data set>: The name of the corpus and data set. Ex: 10hr/train\n";
    exit -1;
}

# Build the corpus directory from the command-line argument and drop a single
# trailing slash, if there is one, so later path concatenation stays clean.
$corpus_dir = "/mnt/main/corpus/switchboard/$ARGV[0]";
$last       = substr $corpus_dir, -1;
chop $corpus_dir if $last eq '/';
# set corpus directory

# The script is run from the experiment directory: remember its full path and
# use the directory's own name as the experiment name.
$path = Cwd::getcwd();
$exp  = File::Basename::basename($path);

# Create the SphinxTrain and sphinx3 directory structure for experiment $exp,
# then adapt the generated etc/sphinx_train.cfg to this experiment.
#
# Changes from the previous version:
#   * The "done!" / "Modifying sphinx_train.cfg..." message is printed after the
#     setup scripts have run, not before.
#   * Every external command is checked.  A failure is reported on STDERR; the
#     remaining steps still run, as they did before.
#   * sed is invoked without a shell, and $path / $exp are escaped before being
#     used as sed replacement text.
print "Creating directory structure...";

# A shell is needed here only for the "> /dev/null" redirection, so the
# experiment name is single-quoted for it.
(my $quoted_exp = $exp) =~ s/'/'\\''/g;
$quoted_exp = "'$quoted_exp'";

for my $setup_script (
    "/mnt/main/root/tools/SphinxTrain-1.0/scripts_pl/setup_SphinxTrain.pl",
    "/mnt/main/root/sphinx3/scripts/setup_sphinx3.pl",
) {
    system("$setup_script -task $quoted_exp > /dev/null") == 0
        or warn "\n$setup_script failed (wait status $?)\n";
}

print "done!\nModifying sphinx_train.cfg...";

# In a sed replacement, backslash and "&" are special, and so is the delimiter
# of the expression ("#" for the path, "/" for the experiment name).
(my $path_text = $path) =~ s/([\\&#])/\\$1/g;
(my $exp_text  = $exp)  =~ s{([\\&/])}{\\$1}g;

my $config_file     = "etc/sphinx_train.cfg";
my @sed_expressions = (
    # Park lines starting with <any character>CFG_HMM under the marker TEMP ...
    's/^.CFG_HMM/TEMP/g',
    # ... turn "#<any character>CFG_HMM" into "$CFG_HMM" ...
    's/#.CFG_HMM/$CFG_HMM/g',
    # ... point the template's path and task name at this experiment ...
    "s#/root/speechtools/SphinxTrain-1.0/train1#$path_text#g",
    "s/train1/$exp_text/g",
    # ... and comment out the parked TEMP lines.
    's/^TEMP/#TEMP/g',
);

for my $expression (@sed_expressions) {
    system('sed', '-i', $expression, $config_file) == 0
        or warn "\nsed $expression on $config_file failed (wait status $?)\n";
}

print "done!\n";

# Link the corpus audio into the experiment directory and generate the training
# input files: transcript, dictionary, filler dictionary and phone list.
#
# Changes from the previous version:
#   * Linking wav previously tested "$? == -1", which is only true when a
#     command could not be started at all, so a failing rmdir/ln was never
#     noticed.  The link is now made with Perl's own rmdir/unlink/symlink and
#     their results are checked.
#   * An existing wav symlink (left by an earlier run) is replaced instead of
#     being followed.
#   * A failed link step exits with status 1 instead of 0.
#   * The remaining external commands are checked and a failure is reported on
#     STDERR; as before, later steps still run.
#   * The unused variables $out and $filename are gone.

# Run an external command; warn on STDERR and return false if it fails.
sub _run_step {
    my ($description, @command) = @_;
    return 1 if system(@command) == 0;
    warn "\n$description failed (wait status $?)\n";
    return 0;
}

# Append the given lines, each newline-terminated, to $file.
sub _append_lines {
    my ($file, @lines) = @_;
    my $fh;
    if (!open $fh, '>>', $file) {
        warn "\ncannot append to $file: $!\n";
        return 0;
    }
    print {$fh} map { "$_\n" } @lines;
    if (!close $fh) {
        warn "\ncannot finish writing $file: $!\n";
        return 0;
    }
    return 1;
}

# Wrap a string in single quotes so that /bin/sh passes it on unchanged.
sub _shell_quote {
    my ($text) = @_;
    $text =~ s/'/'\\''/g;
    return "'$text'";
}

print "Linking to utterance files... ";
my $utterance_dir = "$corpus_dir/audio/utt";
my $linked        = 1;
if (-l "wav") {
    $linked = unlink "wav";
}
elsif (-e "wav") {
    $linked = rmdir "wav";
}
$linked &&= symlink $utterance_dir, "wav";
if (!$linked) {
    print "failed!\n";
    warn "could not link wav to $utterance_dir: $!\n";
    exit 1;
}

print "done!\nPreparing data input files...\n";

print "Generating transcript file...";
# A shell is needed here only for the "> /dev/null" redirection.
_run_step(
    "genTrans10.pl",
    join ' ',
        "/mnt/main/scripts/user/genTrans10.pl",
        _shell_quote($corpus_dir),
        _shell_quote($exp . "_train"),
        "> /dev/null"
);

print "genTrans10.pl 100% complete!\nGenerating dictionary file...";
_run_step(
    "pruneDictionary4.pl",
    "/mnt/main/scripts/user/pruneDictionary4.pl",
    "etc/$exp" . "_train.trans",
    $exp
);

# NOTE: no filler-word replacement is performed by this script at present; only
# the progress message is printed.
print "done!\nReplacing filler words in transcript...";

print "done!\nGenerating filler dictionary...";
my $filler_file = "etc/$exp.filler";
_run_step(
    "copying the filler dictionary template",
    'cp', '-i',
    "/mnt/main/root/tools/SphinxTrain-1.0/train1/etc/train1.filler",
    $filler_file
);
_append_lines(
    $filler_file,
    '[NOISE] +noise+',
    '[LAUGHTER] +laugh+',
    '[VOCALIZED-NOISE] +vocalized+'
);

print "done!\nGenerating phones list...";
my $phone_file = "etc/$exp.phone";
_run_step(
    "copying genPhones.csh",
    'cp', '-i', "/mnt/main/scripts/user/genPhones.csh", "etc/."
);
_run_step("genPhones.csh", "etc/genPhones.csh", "etc/$exp");
_append_lines($phone_file, 'SIL', '+laugh+', '+noise+', '+vocalized+');
_run_step("sorting $phone_file", 'sort', $phone_file, '-o', $phone_file);

print "done!\nPreparation complete!\n";
# prefix is the exp_id
# $cmd = "sed -e -i 's/\\[LAUGHTER[^][]*\\]/\\ \\/g' $exp" . "_train.trans";
# system($cmd);
# $cmd = "sed -i s/\\\\\\[LAUGHTER\\\\\\]/\\/g etc/$exp" . "_train.trans";
# system($cmd);
# $cmd = "sed -i s/\\\\\\[NOISE\\\\\\]/\\/g etc/$exp" . "_train.trans";
# system($cmd);
# $cmd = "sed -i s/\\\\\\[VOCALIZED\\-NOISE\\\\\\]/\\/g etc/$exp" . "_train.trans";
# system($cmd);