Speech:CreateUtts.pl

=Summary= Title: createUtts.pl -- newer: genUttAudio.pl Authors: James (primary), Jon (helping) -- Modeling Group SP16 Location: /mnt/main/scripts/user/ Usage: createUtts.pl /absolute/path/to/train.trans /absolute/path/to/utt/directory/ /absolute/path/to/log/directory/
 * Example: createUtts.pl /mnt/main/corpus/switchboard/256hr/train/trans/train.trans /mnt/main/corpus/switchboard/256hr_new/train/audio/utt/ /mnt/main/corpus/switchboard/256hr_new/info/logs/

=Description= This script takes in a transcript file, and from that, generates audio utterance files in the specified /utt/ directory and also generates three logs in the specified /logs/ directory.

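The following logs are generated (one line per transcript entry, three whitespace-separated columns each):
 * train.log -- Content & Format: utterance start time, utterance end time, and duration (end time minus start time); a negative duration indicates a bad entry
 * utt.log -- Content & Format: expected utterance duration, actual duration estimated from the size of the generated utterance file, and their difference; values that are not close indicate a bad utterance
 * conv.log -- Content & Format: utterance end time, duration of the source conversation file estimated from its size, and their difference; a negative difference indicates a bad entry

=Code=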
 * 1) !/usr/bin/perl


 * 1) Authors: James (primary), Jon (helping) -- Modeling Group SP16


 * 1) -Description
 * 2) Takes in a transcript file (i.e. train.trans) and creates utterances from
 * 3) the conversation audio files in /mnt/main/corpus/switchboard/dist/flat
 * 4) Usage: createUtts.pl /absolute/path/to/train.trans /absolute/path
 * 5) /to/directory you want the utts in/ /absolute/path/to/log directory/


 * 1) -Pseudocode-
 * 2) Get arguments
 * 3) Open file
 * 4) Start loop
 * 5) Successively read each line
 * 6) Throw full file name into variable
 * 7) Throw a formatted file name with a 0 after the w and the letter taken off the end
 * 8) (i.e. sw02345 instead of sw2345A) into a variable
 * 9) Throw the start time into a variable
 * 10) Throw the end time into a variable
 * 11) Throw the diff between end time and start time into a variable
 * 12) Get the channel
 * 13) Use sox command like so: sox filein fileout trim start duration remix (1 or 2,
 * 14) depending on the channel)
 * 15) Log info
 * 16) End loop
 * 17) Close file


 * 1) -Start of code--

use strict;
use warnings;
use IO::Handle;

# Cuts one utterance audio file per transcript entry out of the Switchboard
# conversation recordings and appends timing sanity checks to three logs.
#
# Arguments (all required):
#   1. transcript file (e.g. train.trans); every entry starts with
#      "<utterance id> <start time> <end time>"
#   2. directory the utterance .sph files are written to
#   3. directory that train.log, utt.log and conv.log are appended to
#
# Each log receives one tab-separated line per processed entry:
#   train.log  start time, end time, duration           (negative duration: bad)
#   utt.log    expected duration, actual duration, diff (not close: bad)
#   conv.log   utterance end time, conversation duration, diff (negative: bad)

# Directory holding the full conversation recordings (e.g. sw03041.sph)
my $flatDirectory = "/mnt/main/corpus/switchboard/dist/flat/";

# A file's duration is estimated as its size in bytes divided by this value.
# NOTE(review): presumably 8000 bytes per second of audio (8 kHz, one byte per
# sample) -- confirm; any header bytes in the file are counted as audio too.
my $bytesPerSecond = 8000;

# Returns the size-based duration estimate for the file at the given path,
# or 0 when the file is missing or empty.
sub fileDuration {
    my ($path) = @_;
    my $bytes = -s $path;
    return $bytes ? $bytes / $bytesPerSecond : 0;
}

# Get arguments
die "Usage: createUtts.pl /absolute/path/to/train.trans"
  . " /absolute/path/to/utt/directory/ /absolute/path/to/log/directory/\n"
  unless @ARGV >= 3;
my ($trainFile, $targetDirectory, $logDirectory) = @ARGV[0 .. 2];

# File names are appended directly to the directories, so make sure a
# non-empty directory ends in a slash (an empty one keeps meaning "here").
for my $directory ($targetDirectory, $logDirectory) {
    $directory .= "/" if length($directory) && $directory !~ m{/\z};
}

# Open transcript file for reading
open(my $transcript, "<", $trainFile)
  or die "Cannot open transcript '$trainFile': $!\n";

# Open the three logs for appending; flush after every line so the logs can
# be followed while a long run is still in progress.
my %logHandle;
for my $logName (qw(train utt conv)) {
    my $logPath = $logDirectory . $logName . ".log";
    open($logHandle{$logName}, ">>", $logPath)
      or die "Cannot open log '$logPath' for appending: $!\n";
    $logHandle{$logName}->autoflush(1);
}

# Process each entry in the transcript file and create a corresponding
# utterance audio file
while (my $entry = <$transcript>) {
    chomp $entry;

    # Fill array with items in entry (i. e. file name, start time, etc.)
    my @entryItems = split ' ', $entry;
    next unless @entryItems;    # blank line
    if (@entryItems < 3) {
        warn "Skipping malformed entry at line $.: $entry\n";
        next;
    }

    # Copy full file name (i.e. sw3041A-ms98-a-0002) and the two time stamps
    my ($fullFileName, $startTime, $endTime) = @entryItems;
    if (length($fullFileName) < 7) {
        warn "Skipping entry with unexpected file name at line $.: $entry\n";
        next;
    }

    # Creating a formatted file name to find in the flat directory:
    # "sw" . "0" . "3041" gives sw03041 for the example full file name above
    my $formattedFileName =
      substr($fullFileName, 0, 2) . "0" . substr($fullFileName, 2, 4);

    # Get the duration (the diff between end time and start time)
    my $duration = $endTime - $startTime;

    # Get channel: A (speaker A) is channel 1, anything else is channel 2
    my $channel = substr $fullFileName, 6, 1;
    my $remixChannel = (lc($channel) eq "a") ? 1 : 2;

    # Use the sox command to create an utterance audio file given the current
    # entry in the transcript. The list form of system() bypasses the shell,
    # so characters in the file names cannot be interpreted as shell syntax.
    my $conversationFile = $flatDirectory . $formattedFileName . ".sph";
    my $utteranceFile    = $targetDirectory . $fullFileName . ".sph";
    my @soxCmd = (
        "sox", $conversationFile, $utteranceFile,
        "trim", $startTime, $duration,
        "remix", $remixChannel,
    );
    system(@soxCmd) == 0
      or warn "sox failed (status $?) for $fullFileName\n";

    # Log train data -- if duration is negative, bad
    print { $logHandle{train} }
      join("\t", $startTime, $endTime, $duration), "\n";

    # Log utt data -- if expected and actual are not close in value, bad
    my $actUttDuration = fileDuration($utteranceFile);
    print { $logHandle{utt} }
      join("\t", $duration, $actUttDuration, $actUttDuration - $duration),
      "\n";

    # Log conv data -- if the diff is not positive or 0, bad
    my $convDuration = fileDuration($conversationFile);
    print { $logHandle{conv} }
      join("\t", $endTime, $convDuration, $convDuration - $endTime), "\n";

    # Give something for the user to see
    print join(" ", @soxCmd), "\n";
}

close($transcript);

# Buffered write errors surface at close, so check every log handle
for my $logName (sort keys %logHandle) {
    close($logHandle{$logName})
      or warn "Error closing $logName.log: $!\n";
}


 * 1) -End of code