Speech:Spring 2018 Software Group corpus.c

File Location
../sphinx3/src/libs3decoder/libcommon/corpus.c

General Notes
utt_res_t: A structure to store utterance-based resources. Assume that most resources are string pointers; the string itself is pre-allocated somewhere.
char* utt_res_t::cb2mllrname: The code book to regression matrix file name for this utterance.
char* utt_res_t::fsgname: FSG file name for this utterance. For one utterance, one could only use either LM or FSG.
char* utt_res_t::lmname: LM file name for this utterance.
char* utt_res_t::regmatname: The regression matrix file name for this utterance.
char* utt_res_t::uttfile: Utterance file name.
corpus_t: Structure for a corpus: essentially a set of strings each associated with a unique ID (such as a reference sentence file, hypothesis file, and various control files). NOTE: IDs are CASE-SENSITIVE.
hash_table_t* corpus_t::ht: Hash table for IDs; CASE-SENSITIVE.
int32 corpus_t::n: Number of IDs (and corresponding argument strings) in the corpus.
char** corpus_t::str: The argument strings.

new_utt_res
utt_res_t *new_utt_res(void)

free_utt_res
void free_utt_res(utt_res_t * ur)

Parameters
ur an utt_res_t

report_utt_res
void report_utt_res(utt_res_t * ur)

Parameters
ur an utt_res_t

corpus_load_headid
corpus_t *corpus_load_headid(const char *file, int32 (*validate)(char *str), int32 (*dup_resolve)(char *s1, char *s2))

Parameters
file 	Input file name; the file must be seekable and rewindable.

Validate
validate is an optional application-supplied function to determine if each input corpus data entry is eligible (valid) for inclusion in the final corpus. It should return an integer value signifying the following actions: 0: Not valid, skip the entry; !0: Valid, include the entry. If validate is NULL, every input entry is included in the corpus.

dup_resolve
dup_resolve is an optional application-supplied function to resolve duplicate keys (IDs). It may be NULL if none is available. If present, and a duplicate key is encountered, the function is invoked with the original and the duplicate corpus strings as arguments (s1 and s2, respectively). It should return an integer value signifying the following actions: 0: Retain the original string, discard the new one; >0: Replace the original string with the new one; <0: Error (causes a FATAL_ERROR). If dup_resolve is NULL, any duplicate ID causes a FATAL_ERROR.

sep_tailid
static int32 sep_tailid(char *line, char *uttid)

corpus_load_tailid
corpus_t *corpus_load_tailid(const char *file, int32 (*validate)(char *str), int32 (*dup_resolve)(char *s1, char *s2))

Parameters
file 	Input file name; the file must be seekable and rewindable.

corpus_lookup
char *corpus_lookup(corpus_t *corp, const char *id)

main
#if _CORPUS_TEST_
/*
 * Interactive test driver, compiled only when _CORPUS_TEST_ is set.
 *
 * Loads a head-ID corpus from argv[1] and a tail-ID corpus from argv[2],
 * then repeatedly reads an ID from stdin and prints the string associated
 * with it in each corpus (or a "Not found" message).
 *
 * The loop ends when no further ID can be read (EOF or input error).
 */
int
main(int32 argc, char *argv[])
{
    corpus_t *ch, *ct;
    char id[4096], *str;

    if (argc != 3)
        E_FATAL("Usage: %s headid-corpusfile tailid-corpusfile\n",
                argv[0]);

    ch = corpus_load_headid(argv[1], NULL, NULL);
    ct = corpus_load_tailid(argv[2], NULL, NULL);

    for (;;) {
        printf("> ");

        /* Bound the read to the buffer size and stop on EOF/error instead
         * of looping forever on stale contents of id. */
        if (scanf("%4095s", id) != 1)
            break;

        str = corpus_lookup(ch, id);
        if (str == NULL)
            printf("%s Not found in 1\n", id);  /* %s needs its argument */
        else
            printf("%s(1): %s\n", id, str);

        str = corpus_lookup(ct, id);
        if (str == NULL)
            printf("%s Not found in 2\n", id);  /* %s needs its argument */
        else
            printf("%s(2): %s\n", id, str);
    }

    return 0;
}
#endif

ctl_read_entry
int32 ctl_read_entry(FILE * fp,               char *uttfile,                int32 * sf,                int32 * ef,                char *uttid)

Parameters
fp 	In: an input file pointer
uttfile 	Out: (Cep)file containing utterance data
sf 	Out: Start frame in uttfile; 0 if omitted
ef 	Out: End frame in uttfile; -1 (signifying until EOF) if omitted
uttid 	Out: Utterance ID (generated from uttfile/sf/ef if omitted)

ctl_process
ptmr_t ctl_process(const char *ctlfile,           const char *ctllmfile,            const char *ctlmllrfile,            int32 nskip,            int32 count,            void (*func) (void *kb, utt_res_t * ur, int32 sf, int32 ef, char *uttid),            void *kb)

Return value
ptmr_t structure containing cpu/elapsed time stats for the run.

Parameters
ctlfile 	In: Control file to read; use stdin if NULL
ctllmfile 	In: Control file that specifies the LM used for the corresponding utterance
ctlmllrfile 	In: Control file that specifies the MLLR used for the corresponding utterance
nskip 	In: No. of entries to skip at the head
count 	In: No. of entries to process after nskip
func 	In: Function to be invoked for each of the count entries processed
kb 	In: A catch-all data pointer to be passed as the first argument to func above

ctl_process_utt
ptmr_t ctl_process_utt(const char *uttfile,               int32 count,                void (*func) (void *kb, utt_res_t * ur, int32 sf, int32 ef, char *uttid),                void *kb)

Returns

 * ptmr_t structure containing cpu/elapsed time stats for the run.

Parameters
uttfile 	In: Filename to be processed (in its entirety)
count 	In: No. of iterations to process uttfile
func 	In: A function pointer that does the actual processing

ctl_infile
void ctl_infile(char *file,           const char *dir,            const char *ext,            const char *utt)

Parameters
file 	Out: Generated filename (allocated by caller)
dir 	In: Optional directory spec if relative utt specified
ext 	In: File extension to be appended to utt to generate complete filename
utt 	In: Utterance file pathname, absolute or relative, with or without file extension. This is usually the first field in a control file.

ctl_outfile
void ctl_outfile(char *file,           const char *dir,            const char *ext,	          const char *utt,            const char *uttid,            int build_dirs)

Parameters
file 	Out: Generated filename (allocated by caller)
dir 	In: Directory for the generated filename; see comment for special handling of ,CTL suffix
ext 	In: File extension applied to the generated filename
utt 	In: Utterance file pathname, absolute or relative, with or without extension. This is usually the first field in a control file.
uttid 	In: Utterance ID (derived from the control file)