Speech:CheckTrain.pl

Summary
This script checks that each transcript entry has a corresponding .sph file. Each .sph file has a 4-digit number that identifies it. Each transcript entry has a 5-digit number (the same 4 digits plus 1 extra digit for multiples). The script extracts the 4-digit number from every .sph file name and every transcript entry and verifies that the two sets of numbers match.

It takes the name of the training directory as an argument.

Author
Jared Rohrdanz

Code
#!/usr/bin/perl
#
# CheckTrain.pl - verify that the transcript entries of a training set and
# the audio files of that training set refer to the same 4-digit IDs.
#
# Usage:   CheckTrain.pl <training-directory-name>
# Output:  one line on STDOUT stating whether the two ID lists match.
# Dies if: the argument is missing, or the wav directory / transcript file
#          cannot be opened.
use strict;
use warnings;

my $train = $ARGV[0];
die "usage: $0 <training directory name>\n"
    unless defined $train && length $train;

my $dir        = "/mnt/main/corpus/switchboard/$train/train/wav";
my $trans_file = "/mnt/main/corpus/switchboard/$train/train/trans/train.trans";

# --- Collect the 4-character ID embedded in every audio file name. --------
opendir(my $wav_dh, $dir) or die "can't open $dir: $!";

my @wav;
# Explicit defined(): a file literally named "0" must not end the scan.
while (defined(my $filename = readdir $wav_dh)) {
    next if $filename =~ /^\./;          # skip ".", ".." and hidden files
    push @wav, substr($filename, 3, 4);  # ID sits at offset 3, length 4
}
closedir $wav_dh;

# --- Collect the distinct 4-character IDs found in the transcript. --------
open(my $trans_fh, '<', $trans_file)
    or die "can't open $trans_file: $!";

my %seen;
my @uniq_trans;
while (my $row = <$trans_fh>) {
    # A blank line carries no ID; without this guard substr() would return
    # undef and produce a spurious mismatch.
    next unless $row =~ /\S/;

    my $trans_num = substr($row, 2, 4);  # ID sits at offset 2, length 4
    # Several transcript entries share one ID; keep the first occurrence.
    push @uniq_trans, $trans_num unless $seen{$trans_num}++;
}
close $trans_fh;

# --- Compare the two lists position by position after numeric sorting. ----
@uniq_trans = sort { $a <=> $b } @uniq_trans;
@wav        = sort { $a <=> $b } @wav;

# Explicit comparison instead of the deprecated smartmatch operator:
# the lists match when they have equal length and equal elements.
my $lists_match = @uniq_trans == @wav;
if ($lists_match) {
    for my $i (0 .. $#wav) {
        next if $uniq_trans[$i] eq $wav[$i];
        $lists_match = 0;
        last;
    }
}

if ($lists_match) {
    print "The audio files and transcripts match.\n";
}
else {
    print "The audio files don't match the transcripts.\n";
}

# Debugging aids (uncomment to dump both ID lists):
#print "transcripts from a .sph file... \n @uniq_trans \n";
#print ".sph files... \n @wav \n";