Speech:Summer 2012 copySph.pl


 * Home
 * Information

copySph.pl Perl Script
This script reads the transcript in the trans folder of a corpus dir and creates, in the corresponding wav folder of that corpus dir, symbolic links to all of the sph files the transcript references. It takes one argument, which is the path to the corpus dir in question.

Example usage is ./copySph.pl /mnt/main/corpus/switchboard/mini/train

Source Code

#!/usr/bin/perl


# This script will make symbolic links to all the required sph files
# that are noted in a transcript file located in a particular corpus dir

use strict;
use warnings;

# Creates, in <corpus_dir>/wav, a symbolic link to every sph file that is
# referenced by the transcript <corpus_dir>/trans/train.trans.
#
# Argument: the path to the corpus dir.
# Exits with -1 after printing a usage message when the argument count is
# wrong; dies when the transcript cannot be opened.

# Directory holding the sph files that the links are created from.
my $sph_source_dir = '/mnt/main/corpus/dist/Switchboard/flat';

# Exactly one argument (the corpus dir) is required.
if (@ARGV != 1) {
    print "usage: copySph.pl <corpus_dir>\n"
        . " Example: /mnt/main/corpus/switchboard/mini/train\n";
    exit -1;
}

# set corpus directory
my $corpus_dir = $ARGV[0];

# append the path to the trans file based on the corpus dir provided
my $trans_unedited = $corpus_dir . "/trans/train.trans";

# open the transcript file for processing
open(my $trans_fh, '<', $trans_unedited) or die("can't open file: $!");

# Build the list of sph files noted in the transcript, keeping each name only
# once and in order of first appearance.
# credit for the %seen de-duplication idiom goes to the Perl Cookbook
my %seen;
my @sphList;
while (my $line = <$trans_fh>) {    # read in file line by line
    # Match the first "sw" followed by digits. Lines without a match are
    # skipped so that a stale match from an earlier line is never reused.
    next unless $line =~ m/(sw[0-9]*)/;

    my $sphName = $1;
    $sphName =~ s/^sw/sw0/;    # replace the leading sw with sw0
    $sphName .= ".sph";

    push(@sphList, $sphName) unless $seen{$sphName}++;
}
close($trans_fh);

# Link each sph file in the list into the wav dir in the corpus dir.
# Per the original author's note, the intent is a new symbolic link that
# points to the original file and not to a link to the original file.
for my $item (@sphList) {
    # List-form system() bypasses the shell, so unusual characters in the
    # corpus path cannot be interpreted as shell syntax.
    my @cmd = ('cp', '-ps', "$sph_source_dir/$item", "$corpus_dir/wav/.");
    system(@cmd) == 0
        or warn "command failed (\$? = $?): @cmd\n";
}

print "done\n";