Speech:GenTrans5

Summary
Title: genTrans5.pl

Author: unknown

Location: /mnt/main/scripts/user/genTrans5.pl

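Usage: genTrans5.pl <corpus_dir> <exp_id> (run from the top-level experiment directory, e.g. from /mnt/main/Exp/0011: genTrans5.pl /mnt/main/corpus/switchboard/tiny/train 0011)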

Description
Fixes some regular expressions in genTrans4 and adds more regular expressions by request of the project manager.

Code

#!/usr/bin/perl
# gentrans5 removes all instances between [] -[] []-

use strict;
use warnings;

# genTrans5.pl
#
# Reads <corpus_dir>/trans/train.trans and, for every utterance line:
#   * cuts the utterance audio out of <corpus_dir>/wav/<conversation>.sph
#     with sox and writes it to wav/<utteranceID>.sph,
#   * appends the cleaned, upper-cased transcript to etc/<exp_id>_train.trans,
#   * appends the utterance ID to etc/<exp_id>_train.fileids.
#
# Arguments:
#   $ARGV[0]  corpus directory (must contain trans/train.trans and wav/)
#   $ARGV[1]  experiment ID, used as the prefix of the output file names
#
# Must be run from the top-level experiment directory, because the etc/ and
# wav/ output paths are relative to the current directory.

if (@ARGV != 2) {
    print "usage: genTrans5.pl <corpus_dir> <exp_id>\n"
        . " Example: /mnt/main/corpus/switchboard/tiny/train 0011\n";
    print "should be executed from the top level experiment directory ex: /mnt/main/Exp/0011\n";
    exit -1;
}

my ($corpus_dir, $trans_prefix) = @ARGV;

# Input transcript lives under the corpus directory.
my $trans_unedited = "$corpus_dir/trans/train.trans";

# Output file names are prefixed with the experiment ID.
my $train_trans   = "etc/${trans_prefix}_train.trans";
my $train_fileids = "etc/${trans_prefix}_train.fileids";

# Both output files are opened in append mode below; removing stale copies
# first was disabled in the original script and is left disabled here.
# system ("rm $train_trans");
# system ("rm $train_fileids");

print "Begin processing ...\n";

open(my $in_fh, '<', $trans_unedited)
    or die "can't open file $trans_unedited: $!";
open(my $trans_fh, '>>', $train_trans)
    or die "can't open file $train_trans: $!";
open(my $ids_fh, '>>', $train_fileids)
    or die "can't open file $train_fileids: $!";

# Load the whole file up front so the total line count is known for the
# progress display.
my @lines = <$in_fh>;
close($in_fh);

my $total = scalar @lines;
my $index = 0;

print "\rStatus: 0% completed";

LINE:
foreach my $raw (@lines) {
    $index++;

    chomp(my $line = $raw);
    next LINE unless $line =~ /\S/;    # nothing to do for blank lines

    # The utterance ID is everything up to the first space.
    my ($utterance_id) = $line =~ /\A([^ ]*)/;

    # Conversation name (sw + digits), and the start/stop times as two
    # decimal numbers separated by a single space.
    my ($conversation) = $line =~ /(sw[0-9]*)/;
    my ($start, $stop) = $line =~ /(\d+\.\d+) (\d+\.\d+)/;

    unless (defined $conversation && defined $start) {
        # Without these fields the sox commands cannot be built; skip the
        # line instead of reusing values left over from an earlier match.
        warn "\nSkipping malformed line $index: $line\n";
        next LINE;
    }

    # The audio files are named sw0NNNN.sph while the transcript says swNNNN.
    (my $sph_name = $conversation) =~ s/^sw/sw0/;
    $sph_name .= ".sph";

    my $duration = $stop - $start;
    my $message  = clean_message($line);

    # Cut the utterance out of the conversation recording, then convert the
    # temporary file to the per-utterance .sph file.
    run_command('sox', '-U', "$corpus_dir/wav/$sph_name",
                '-a', 'wav/temp.wav', 'trim', $start, $duration);
    run_command('sox', 'wav/temp.wav', "wav/$utterance_id.sph");

    # NOTE(review): text inside angle brackets was lost when this script was
    # pasted into the wiki (the readline operand was missing). If the deployed
    # script wraps the message in <s> ... </s> markers, restore them here.
    print {$trans_fh} " $message ($utterance_id)\n";
    print {$ids_fh} "$utterance_id\n";
}
continue {
    # Display the % completed - only show 10, 20, 30 etc...
    my $percent_done = int($index / $total * 100);
    if ($percent_done % 10 == 0) {
        print "\rStatus: " . $percent_done . "% completed";
    }
}

# Buffered write errors only surface at close, so check them.
close($trans_fh) or die "can't close file $train_trans: $!";
close($ids_fh)   or die "can't close file $train_fileids: $!";
print "\nDone!";

# Strip the line header and transcription markup from one transcript line and
# return the remaining text in upper case.
sub clean_message {
    my ($message) = @_;

    # Remove everything before the message (utterance ID and both times).
    $message =~ s/sw[0-9]*[A-B]-ms98-a-[0-9]* [0-9]*.[0-9]* [0-9]*.[0-9]* //;

    # Bracketed annotations: [x/y], -[x] and [x]-.
    $message =~ s/\[.*?\/.*?\]//g;
    $message =~ s/-\[.*?\]//g;
    $message =~ s/\[.*?\]-//g;
    # NOTE(review): as written this only removes a literal "^*]"; it may be a
    # garbled copy of a plain [..] removal - confirm against the deployed copy.
    $message =~ s/\^\*\]//g;

    $message =~ s/(\si\-\s)/ i /g;    # replace i- with i
    $message =~ s/<.*?>//g;           # remove < >
    $message =~ tr/{}//d;             # remove { and }
    $message =~ s/_1//g;              # remove _1
    $message =~ tr/-/ /;              # replace - with space
    $message =~ s/ {2,}/ /g;          # collapse every run of spaces to one

    return uc $message;
}

# Run an external command without going through the shell; die only when the
# program could not be started at all (same condition the script always used).
sub run_command {
    my @cmd = @_;

    system(@cmd);
    if ($? == -1) {
        die "\n Error executing: @cmd\nIs sox installed?\n";
    }
    return;
}