Speech:GenTrans.pl

=Summary= Title: genTrans.pl Authors: Unknown Location: /mnt/main/scripts/user/ /mnt/main/scripts/user/History/genTrans/15 /mnt/main/scripts/user/History/genTrans/cur Archive: Older versions Called by: makeTrain.pl v5 Usages: Usage #1: genTrans.pl [-d|-e|-t] <corpus> <corpus_dir> <exp_id> Usage #2: genTrans.pl <corpus> <corpus_dir> <exp_id> Examples: Example #1: genTrans.pl [-e] switchboard 300hr/test 001 Example #2: genTrans.pl switchboard 30hr/train 001

=Description= This script takes an optional flag (-d, -e, or -t), a master corpus, a sub-corpus directory, and an experiment ID as arguments. When no flag is given, train.trans is used.

The purpose of this script is to let the user generate the transcript and file-ID lists for the desired experiment, and to create symlinks from the corpus audio to the wav directory within the experiment.

=Code= =begin comment Generate Transcripts (genTrans) Semester: Spring 2016 Start Date: 4/10/16 Last Modified: 4/14/16 Recent changes: - Added help information - Made $flag optional
#!/usr/bin/perl

flags -d use dev.trans -e use eval.trans -t use train.trans =cut

# Validate the argument count. The script accepts three positional arguments
# (corpus, corpus directory, experiment id), optionally preceded by one flag,
# so @ARGV must hold exactly 3 or 4 entries. Anything else prints the usage
# text and exits with status -1.
if (@ARGV != 4 and @ARGV != 3) {
    # Placeholder names mirror the variables the arguments are assigned to
    # further down (corpus, corpus_dir, and the experiment-id prefix).
    print <<'END_USAGE';
Usage   #1: genTrans.pl [-d|-e|-t] <corpus> <corpus_dir> <exp_id>
Example #1: genTrans.pl [-e] switchboard 300hr/test 001
Usage   #2: genTrans.pl <corpus> <corpus_dir> <exp_id>
Example #2: genTrans.pl switchboard 30hr/train 001
Information: Run from the main experiment directory ex: /mnt/main/Exp/0280/001
Flag -d: Points to the /trans/dev.trans
Flag -e: Points to the /trans/eval.trans
Flag -t: Points to the /trans/train.trans
END_USAGE
    exit -1;
}

# Parse the command line and work out which source transcript to read.
#
# With four arguments the first is a flag choosing the transcript file;
# with three arguments the training transcript is used. The results are
# left in package globals because later parts of the script read them:
#   $corpus_path     root directory of the chosen corpus
#   $corpus_dir      sub-corpus directory below $corpus_path
#   $trans_prefix    experiment id, used as the output file prefix
#   $trans_unedited  full path of the source transcript

# Transcript used when no flag is supplied.
$train_id = 'train.trans';

# Choose whether to use the dev, eval, or train transcript.
my %trans_for_flag = (
    '-d' => 'dev.trans',
    '-e' => 'eval.trans',
    '-t' => 'train.trans',
);

if (@ARGV == 4) {
    ($flag, $corpus, $corpus_dir, $trans_prefix) = @ARGV;

    # An unrecognised flag used to leave the transcript name undefined, so
    # the path silently ended in "trans/". Stop with a clear message instead.
    exists $trans_for_flag{$flag}
        or die "genTrans.pl: unknown flag '$flag' (expected -d, -e or -t)\n";

    $trans_id = $trans_for_flag{$flag};
}
else {
    ($corpus, $corpus_dir, $trans_prefix) = @ARGV;
    $trans_id = $train_id;
}

$corpus_path    = "/mnt/main/corpus/$corpus";
$trans_unedited = "$corpus_path/$corpus_dir/trans/$trans_id";

# Echo the resolved source transcript path (troubleshooting aid).
print "$trans_unedited\n";

# set the output file names: both live under etc/ and are prefixed with the
# experiment id held in $trans_prefix.
$train_trans   = "etc/${trans_prefix}_train.trans";
$train_fileids = "etc/${trans_prefix}_train.fileids";

print "Processing\n";

# Convert the source transcript into the experiment's transcript and file-id
# lists, linking each utterance's audio file into ./wav along the way.
#
# Reads:   $trans_unedited (source transcript), $corpus_path, $corpus_dir
# Appends: $train_trans    one " TEXT (utteranceID)" line per utterance
#          $train_fileids  one utteranceID per line
# Creates: ./wav/<utteranceID>.sph symlinks pointing into the corpus
# Dies if any of the three files cannot be opened or the outputs cannot be
# closed; a failed symlink only produces a warning so processing continues.
open(my $trans_in, '<', $trans_unedited)
    or die("can't open file $trans_unedited: $!");
open(my $trans_out, '>>', $train_trans)
    or die("can't open file $train_trans: $!");
open(my $ids_out, '>>', $train_fileids)
    or die("can't open file $train_fileids: $!");

# The whole file is loaded up front because the progress display needs the
# total number of lines.
my @lines = <$trans_in>;
close($trans_in);

my $total = @lines;
my $index = 0;

foreach my $line (@lines) {
    $index++;
    my $percent_done = int($index / $total * 100);    # percent complete

    chomp $line;

    # A blank line has no utterance ID; skip it rather than emit an empty
    # transcript entry and a link named ".sph".
    next if $line !~ /\S/;

    # The utterance ID is everything before the first space.
    (my $utteranceID = $line) =~ s/ .*//;

    # Remove the leading "<id> <start> <stop> " fields, leaving the message.
    # NOTE(review): the dots between the digit runs are unescaped and so
    # match any character; left unchanged to keep matching identical.
    my $message = $line;
    $message =~ s/sw[0-9]*[A-B]-ms98-a-[0-9]* [0-9]*.[0-9]* [0-9]*.[0-9]* //;

    # Strip non-speech markers. Per the original author's note, newer
    # transcripts no longer carry the older notation, so only these remain.
    $message =~ s/\[noise\]\s//g;              # remove [noise]
    $message =~ s/\[laughter\]\s//g;           # remove [laughter]
    $message =~ s/\[vocalized-noise\]\s//g;    # remove [vocalized-noise]
    $message =~ s/<.*?>//g;                    # remove < >

    $message =~ s/ {2,}/ /g;    # collapse runs of spaces left by the removals
    $message = uc $message;     # all text to uppercase

    # Link the utterance audio into the experiment's wav directory. The
    # built-in symlink avoids passing interpolated paths through a shell.
    my $link_target = "$corpus_path/$corpus_dir/audio/utt/$utteranceID.sph";
    my $link_name   = "./wav/$utteranceID.sph";
    symlink($link_target, $link_name)
        or warn "\ncan't link $link_name -> $link_target: $!\n";

    # NOTE(review): the leading space and bare "(id)" suffix suggest that
    # sentence markers may have been lost from this format string - confirm
    # against the deployed script before relying on the output format.
    print {$trans_out} " $message ($utteranceID)\n";    # transcript line
    print {$ids_out} "$utteranceID\n";                  # file-id line

    print "\rgenTrans.pl ". $percent_done. "% completed";    # progress
}

# Buffered write errors only surface at close, so check both output handles.
close($trans_out) or die("can't close file $train_trans: $!");
close($ids_out)   or die("can't close file $train_fileids: $!");

# Start a fresh line so "Done!" is not glued onto the progress display.
print "\nDone!\n";