Speech:GenTrans6

Summary
Title: genTrans6.pl

Author: David Meehan

Location: /mnt/main/scripts/user/genTrans6.pl

Usage: /mnt/main/scripts/user/genTrans6.pl <corpus_dir> <exp_id>

Code

#!/usr/bin/perl
# gentrans5 removes all instances between [] -[] []-

use strict;
use warnings;

# genTrans6.pl
#
# Reads <corpus_dir>/trans/train.trans and, for every transcript line:
#   * cuts the utterance out of <corpus_dir>/wav/sw0NNNN.sph with sox and
#     stores it as wav/<utteranceID>.sph,
#   * appends the cleaned, upper-cased text to etc/<exp_id>_train.trans,
#   * appends the utterance ID to etc/<exp_id>_train.fileids.
#
# Arguments:
#   corpus_dir  corpus directory holding trans/train.trans and wav/
#   exp_id      experiment ID, used as the prefix of the output file names
#
# The etc/ and wav/ paths are relative, so the script has to be started from
# the top-level experiment directory (see the usage text below).
#
# Each input line is expected to look like
#   sw<digits><A|B>-ms98-a-<digits> <start> <stop> <text...>
# where <start> and <stop> are decimal numbers. Lines that do not provide an
# utterance ID, an "sw<digits>" token and two decimal numbers are reported on
# STDERR and skipped.

$| = 1;    # the progress line has no newline, so flush it immediately

if (@ARGV != 2) {
    print "usage: genTrans.pl <corpus_dir> <exp_id>\n"
        . " Example: /mnt/main/corpus/switchboard/tiny/train 0011\n";
    print "should be executed from the top level experiment directory ex: /mnt/main/Exp/0011\n";
    exit -1;
}

my ($corpus_dir, $trans_prefix) = @ARGV;    # corpus directory, exp_id prefix

# input transcript, located relative to the corpus directory provided
my $trans_unedited = "$corpus_dir/trans/train.trans";

# output file names
my $train_trans   = "etc/${trans_prefix}_train.trans";
my $train_fileids = "etc/${trans_prefix}_train.fileids";

# The outputs are opened in append mode and the clean-up below is disabled,
# so running the script twice adds every entry a second time.
# system ("rm $train_trans");
# system ("rm $train_fileids");

print "processing.";

open(my $in_fh, '<', $trans_unedited)
    or die("can't open file $trans_unedited: $!");
open(my $trans_fh, '>>', $train_trans)
    or die("can't open file $train_trans: $!");
open(my $ids_fh, '>>', $train_fileids)
    or die("can't open file $train_fileids: $!");

# The whole file is loaded up front because the progress display needs the
# total number of lines.
my @lines = <$in_fh>;
close($in_fh);

my $index = 0;
print "\rgenTrans.pl 0% completed";

LINE:
foreach my $raw_line (@lines) {
    $index++;
    my $percent_done = int($index / @lines * 100);

    my $line = $raw_line;
    chomp $line;
    next LINE if $line !~ /\S/;    # nothing to do for blank lines

    # The utterance ID is everything before the first space.
    (my $utterance_id = $line) =~ s/ .*//;

    # Source audio file: the first "sw<digits>" token, with "sw" -> "sw0".
    my ($sw_digits) = $line =~ /sw([0-9]*)/;

    # Start and stop time: the first pair of space-separated decimals.
    my ($start, $stop) = $line =~ /(\d+\.\d+) (\d+\.\d+)/;

    # A failed match must not fall through: the original code would then
    # have reused the captures left over from an earlier line.
    if ($utterance_id eq '' || !defined $sw_digits || !defined $start) {
        warn "\nskipping malformed transcript line $index: $line\n";
        next LINE;
    }

    my $sph_name = "sw0$sw_digits.sph";
    my $duration = $stop - $start;

    # Strip the "<utteranceID> <start> <stop> " prefix to get the message.
    my $message = $line;
    $message =~ s/sw[0-9]*[A-B]-ms98-a-[0-9]* [0-9]*\.[0-9]* [0-9]*\.[0-9]* //;

    # Disabled filters, kept for reference:
    #$message =~ s/\"//g; removed new transcripts do not have this notation
    #$message =~ s/\[.*?\/.*?\]//g;
    #$message =~ s/-\[.*?\]//g;
    #$message =~ s/\[.*?\]-//g;
    #$message =~ s/\^\*\]//g;
    #$message =~ s/\/.*?]//g;
    $message =~ s/\[noise\]//g;              # remove [noise]
    $message =~ s/\[laughter\]//g;           # remove [laughter]
    $message =~ s/\[vocalized-noise\]//g;    # remove [vocalized-noise]
    #$message =~ s/\[laughter-//g; #remove [laughter-
    #$message =~ s/\[.*?\///g; #remove [ /
    #$message =~ s/(\si\-\s)/ i /g; #replace i- with i
    $message =~ s/<.*?>//g;                  # remove < >
    #$message =~ s/-\[//g; #remove -[
    #$message =~ s/\]-//g; #remove ]-
    #$message =~ s/\]//g; #remove ]
    #$message =~ s/\[//g; #remove [
    #$message =~ s/{//g; #remove {
    #$message =~ s/}//g; #remove }
    #$message =~ s/\_1//g; #remove _1
    #$message =~ s/-/ /g; #replace - with space
    $message =~ s/ {2,}/ /g;                 # collapse runs of spaces into one
    $message = uc $message;                  # all text to uppercase

    # Cut the utterance out of the conversation recording, then convert the
    # scratch file to the per-utterance .sph clip. The sox format options
    # (-U, -a) are passed through exactly as in the original command line.
    # wav/temp.wav is reused for every utterance, so if either step fails
    # the entry is skipped rather than paired with stale audio.
    my $clip_ok =
        run_sox('-U', "$corpus_dir/wav/$sph_name",
                '-a', 'wav/temp.wav', 'trim', $start, $duration)
        && run_sox('wav/temp.wav', "wav/$utterance_id.sph");
    if (!$clip_ok) {
        warn "\nsox failed for $utterance_id, entry skipped\n";
        next LINE;
    }

    # NOTE(review): the page this script was recovered from drops
    # angle-bracket tokens, so start/end markers around $message may have
    # been lost here -- confirm against a known-good transcript file.
    print {$trans_fh} " $message ($utterance_id)\n";    # transcript entry
    print {$ids_fh} "$utterance_id\n";

    print "\rgenTrans.pl ". $percent_done. "% completed";
}

# Buffered write errors only surface at close, so check the write handles.
close($trans_fh) or die("can't close file $train_trans: $!");
close($ids_fh)   or die("can't close file $train_fileids: $!");
print "\ndone";

# Run sox with the given argument list, without going through a shell.
# Returns true when sox exits with status 0 and false on any other exit
# status; dies when the program could not be started at all.
sub run_sox {
    my @sox_args = @_;
    my @command  = ('sox', @sox_args);

    system(@command);
    if ($? == -1) {
        die "\n Error executing: @command\nIs sox installed?\n";
    }
    return $? == 0;
}