Source code for montreal_forced_aligner.validation.corpus_validator

"""
Validating corpora
==================
"""
from __future__ import annotations

import logging
import os
import time
import typing
from decimal import Decimal
from pathlib import Path
from typing import TYPE_CHECKING, Any, Dict, Optional

import sqlalchemy

from montreal_forced_aligner.acoustic_modeling.trainer import TrainableAligner
from montreal_forced_aligner.alignment import PretrainedAligner
from montreal_forced_aligner.data import WorkflowType
from montreal_forced_aligner.db import Corpus, File, SoundFile, Speaker, TextFile, Utterance
from montreal_forced_aligner.exceptions import ConfigError, KaldiProcessingError
from montreal_forced_aligner.helper import comma_join, load_configuration, mfa_open
from montreal_forced_aligner.utils import log_kaldi_errors

if TYPE_CHECKING:
    from montreal_forced_aligner.abc import MetaDict


__all__ = ["TrainingValidator", "PretrainedValidator"]

logger = logging.getLogger("mfa")


class ValidationMixin:
    """
    Mixin class for performing validation on a corpus

    Parameters
    ----------
    ignore_acoustics: bool
        Flag for whether feature generation and training/alignment should be skipped
    test_transcriptions: bool
        Flag for whether utterance transcriptions should be tested with a unigram language model
    phone_alignment: bool
        Flag for whether alignments should be compared to a phone-based system
        (not named by this ``__init__``; if supplied it is forwarded through ``**kwargs``)
    target_num_ngrams: int
        Target number of ngrams from speaker models to use
    order: int
        Stored as ``self.order``; presumably the n-gram order used for the speaker
        language models (TODO confirm against the language model trainer)
    method: str
        Stored as ``self.method``; presumably the smoothing method used for the speaker
        language models (TODO confirm against the language model trainer)

    See Also
    --------
    :class:`~montreal_forced_aligner.alignment.base.CorpusAligner`
        For corpus, dictionary, and alignment parameters
    """

    def __init__(
        self,
        ignore_acoustics: bool = False,
        test_transcriptions: bool = False,
        target_num_ngrams: int = 100,
        order: int = 3,
        method: str = "kneser_ney",
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.ignore_acoustics = ignore_acoustics
        self.test_transcriptions = test_transcriptions
        self.target_num_ngrams = target_num_ngrams
        self.order = order
        self.method = method

    @property
    def working_log_directory(self) -> Path:
        """Working log directory (the ``log`` child of the working directory)"""
        return self.working_directory.joinpath("log")

    def analyze_setup(self) -> None:
        """
        Analyzes the setup process and outputs info to the console
        """
        begin = time.time()
        with self.session() as session:
            sound_file_count = session.query(SoundFile).count()
            text_file_count = session.query(TextFile).count()
            total_duration = session.query(sqlalchemy.func.sum(Utterance.duration)).scalar()
        # SUM over zero rows yields NULL, which arrives here as None;
        # Decimal("None") would raise, so report an empty corpus as 0 seconds
        if total_duration is None:
            total_duration = 0
        total_duration = Decimal(str(total_duration)).quantize(Decimal("0.001"))
        logger.debug(f"Duration calculation took {time.time() - begin:.3f} seconds")

        begin = time.time()
        # The tally itself is not reported below; only the time taken is logged
        ignored_count = len(self.no_transcription_files)
        ignored_count += len(self.textgrid_read_errors)
        ignored_count += len(self.decode_error_files)
        logger.debug(f"Ignored count calculation took {time.time() - begin:.3f} seconds")

        logger.info("Corpus")
        logger.info(f"{sound_file_count} sound files")
        logger.info(f"{text_file_count} text files")
        if self.no_transcription_files:
            logger.warning(
                f"{len(self.no_transcription_files)} sound files without corresponding transcriptions",
            )
        if self.decode_error_files:
            logger.error(f"{len(self.decode_error_files)} read errors for lab files")
        if self.textgrid_read_errors:
            logger.error(f"{len(self.textgrid_read_errors)} read errors for TextGrid files")
        logger.info(f"{self.num_speakers} speakers")
        logger.info(f"{self.num_utterances} utterances")
        logger.info(f"{total_duration} seconds total duration")

        self.analyze_wav_errors()
        self.analyze_missing_features()
        self.analyze_files_with_no_transcription()
        self.analyze_transcriptions_with_no_wavs()
        if self.decode_error_files:
            self.analyze_unreadable_text_files()
        if self.textgrid_read_errors:
            self.analyze_textgrid_read_errors()

        logger.info("Dictionary")
        self.analyze_oovs()

    def analyze_oovs(self) -> None:
        """
        Analyzes OOVs in the corpus and constructs message

        Writes a per-utterance breakdown to ``utterance_oovs.txt`` in the output
        directory, accumulates the OOV words into ``self.oovs_found``, and logs a
        summary.
        """
        logger.info("Out of vocabulary words")
        output_dir = self.output_directory
        oov_path = os.path.join(output_dir, "oovs_found.txt")
        utterance_oov_path = os.path.join(output_dir, "utterance_oovs.txt")

        total_instances = 0
        with mfa_open(utterance_oov_path, "w") as f, self.session() as session:
            utterances = (
                session.query(
                    File.name,
                    File.relative_path,
                    Speaker.name,
                    Utterance.begin,
                    Utterance.end,
                    Utterance.oovs,
                )
                .join(Utterance.file)
                .join(Utterance.speaker)
                .filter(Utterance.oovs != None)  # noqa
                .filter(Utterance.oovs != "")
            )
            for file_name, relative_path, speaker_name, begin, end, oovs in utterances:
                # NOTE(review): the `!= ""` filter suggests this column holds a string
                # (presumably whitespace-separated words -- confirm against the schema).
                # Split it so words, not characters, are counted, joined and recorded;
                # any other iterable is used as-is.
                oov_words = oovs.split() if isinstance(oovs, str) else list(oovs)
                total_instances += len(oov_words)
                f.write(
                    f"{relative_path.joinpath(file_name)}, {speaker_name}: {begin}-{end}: {', '.join(oov_words)}\n"
                )
                self.oovs_found.update(oov_words)

        if self.oovs_found:
            self.save_oovs_found(self.output_directory)
            logger.warning(f"{len(self.oovs_found)} OOV word types")
            logger.warning(f"{total_instances} total OOV tokens")
            logger.warning(
                f"For a full list of the word types, please see: {oov_path}. "
                f"For a by-utterance breakdown of missing words, see: {utterance_oov_path}"
            )
        else:
            logger.info(
                "There were no missing words from the dictionary. If you plan on using a model trained "
                "on this dataset to align other datasets in the future, it is recommended that there be at "
                "least some missing words."
            )

    def analyze_wav_errors(self) -> None:
        """
        Analyzes any sound file issues in the corpus and constructs message
        """
        logger.info("Sound file read errors")
        output_dir = self.output_directory
        wav_read_errors = self.sound_file_errors
        if wav_read_errors:
            path = os.path.join(output_dir, "sound_file_errors.csv")
            with mfa_open(path, "w") as f:
                for p in wav_read_errors:
                    f.write(f"{p}\n")
            logger.error(
                f"There were {len(wav_read_errors)} issues reading sound files. "
                f"Please see {path} for a list."
            )
        else:
            logger.info("There were no issues reading sound files.")

    def analyze_missing_features(self) -> None:
        """
        Analyzes issues in feature generation in the corpus and constructs message
        """
        logger.info("Feature generation")
        if self.ignore_acoustics:
            logger.info("Acoustic feature generation was skipped.")
            return
        output_dir = self.output_directory
        with self.session() as session:
            utterances = (
                session.query(File.name, File.relative_path, Utterance.begin, Utterance.end)
                .join(Utterance.file)
                .filter(Utterance.ignored == True)  # noqa
            )
            # Count once: every .count() call issues its own query
            missing_count = utterances.count()
            if missing_count:
                path = os.path.join(output_dir, "missing_features.csv")
                with mfa_open(path, "w") as f:
                    for file_name, relative_path, begin, end in utterances:
                        f.write(f"{relative_path.joinpath(file_name)},{begin},{end}\n")
                logger.error(
                    f"There were {missing_count} utterances missing features. "
                    f"Please see {path} for a list."
                )
            else:
                logger.info("There were no utterances missing features.")

    def analyze_files_with_no_transcription(self) -> None:
        """
        Analyzes issues with sound files that have no transcription files
        in the corpus and constructs message
        """
        logger.info("Files without transcriptions")
        output_dir = self.output_directory
        if self.no_transcription_files:
            path = os.path.join(output_dir, "missing_transcriptions.csv")
            with mfa_open(path, "w") as f:
                for file_path in self.no_transcription_files:
                    f.write(f"{file_path}\n")
            logger.error(
                f"There were {len(self.no_transcription_files)} sound files missing transcriptions."
            )
            logger.error(f"Please see {path} for a list.")
        else:
            logger.info("There were no sound files missing transcriptions.")

    def analyze_transcriptions_with_no_wavs(self) -> None:
        """
        Analyzes issues with transcription that have no sound files
        in the corpus and constructs message
        """
        logger.info("Transcriptions without sound files")
        output_dir = self.output_directory
        if self.transcriptions_without_wavs:
            path = os.path.join(output_dir, "transcriptions_missing_sound_files.csv")
            with mfa_open(path, "w") as f:
                for file_path in self.transcriptions_without_wavs:
                    f.write(f"{file_path}\n")
            logger.error(
                f"There were {len(self.transcriptions_without_wavs)} transcription files missing sound files. "
                f"Please see {path} for a list."
            )
        else:
            logger.info("There were no transcription files missing sound files.")

    def analyze_textgrid_read_errors(self) -> None:
        """
        Analyzes issues with reading TextGrid files
        in the corpus and constructs message
        """
        logger.info("TextGrid read errors")
        output_dir = self.output_directory
        if self.textgrid_read_errors:
            path = os.path.join(output_dir, "textgrid_read_errors.txt")
            with mfa_open(path, "w") as f:
                for e in self.textgrid_read_errors:
                    f.write(
                        f"The TextGrid file {e.file_name} gave the following error on load:\n\n{e}\n\n\n"
                    )
            logger.error(
                f"There were {len(self.textgrid_read_errors)} TextGrid files that could not be loaded. "
                f"For details, please see: {path}",
            )
        else:
            logger.info("There were no issues reading TextGrids.")

    def analyze_unreadable_text_files(self) -> None:
        """
        Analyzes issues with reading text files
        in the corpus and constructs message
        """
        logger.info("Text file read errors")
        output_dir = self.output_directory
        if self.decode_error_files:
            path = os.path.join(output_dir, "utf8_read_errors.csv")
            with mfa_open(path, "w") as f:
                for file_path in self.decode_error_files:
                    f.write(f"{file_path}\n")
            logger.error(
                f"There were {len(self.decode_error_files)} text files that could not be read. "
                f"Please see {path} for a list."
            )
        else:
            logger.info("There were no issues reading text files.")

    @staticmethod
    def _log_error_rate(
        rate: float, label: str, warning_threshold: float, error_threshold: float
    ) -> None:
        """Log an error rate as info, warning or error depending on which threshold it reaches"""
        message = f"{rate * 100:.2f}% {label} error rate"
        if rate < warning_threshold:
            logger.info(message)
        elif rate < error_threshold:
            logger.warning(message)
        else:
            logger.error(message)

    def test_utterance_transcriptions(self) -> None:
        """
        Tests utterance transcriptions with simple unigram models based on the utterance text and frequent
        words in the corpus

        Raises
        ------
        :class:`~montreal_forced_aligner.exceptions.KaldiProcessingError`
            If there were any errors in running Kaldi binaries
        """
        try:
            self.train_speaker_lms()

            self.transcribe(WorkflowType.per_speaker_transcription)

            logger.info("Test transcriptions")
            ser, wer, cer = self.compute_wer()
            # Sentence error rate uses looser thresholds than word/character error rate
            self._log_error_rate(ser, "sentence", 0.3, 0.8)
            self._log_error_rate(wer, "word", 0.25, 0.75)
            self._log_error_rate(cer, "character", 0.25, 0.75)

            self.save_transcription_evaluation(self.output_directory)
            out_path = os.path.join(self.output_directory, "transcription_evaluation.csv")
            logger.info(f"See {out_path} for more details.")
        except KaldiProcessingError as e:
            # Surface the Kaldi logs before propagating; other exceptions propagate untouched
            log_kaldi_errors(e.error_logs)
            e.update_log_file()
            raise
class TrainingValidator(TrainableAligner, ValidationMixin):
    """
    Validator class for checking whether a corpus and a dictionary will work together
    for training

    See Also
    --------
    :class:`~montreal_forced_aligner.acoustic_modeling.trainer.TrainableAligner`
        For training configuration
    :class:`~montreal_forced_aligner.validation.corpus_validator.ValidationMixin`
        For validation parameters

    Attributes
    ----------
    training_configs: dict[str, :class:`~montreal_forced_aligner.acoustic_modeling.monophone.MonophoneTrainer`]
    """

    def __init__(self, **kwargs):
        training_configuration = kwargs.pop("training_configuration", None)
        super().__init__(**kwargs)
        self.training_configs = {}
        # Without an explicit configuration, validate with a single monophone pass
        if training_configuration is None:
            training_configuration = [("monophone", {})]
        for k, v in training_configuration:
            self.add_config(k, v)

    @property
    def working_directory(self) -> Path:
        """
        Working directory; transcription workflows use a directory named after the
        current workflow inside the output directory
        """
        if self.current_workflow.workflow_type in [
            WorkflowType.transcription,
            WorkflowType.per_speaker_transcription,
        ]:
            return self.output_directory.joinpath(self._current_workflow)
        return super().working_directory

    @classmethod
    def parse_parameters(
        cls,
        config_path: Optional[Path] = None,
        args: Optional[Dict[str, Any]] = None,
        unknown_args: Optional[typing.Iterable[str]] = None,
    ) -> MetaDict:
        """
        Parse parameters for validation from a config path or command-line arguments

        Parameters
        ----------
        config_path: :class:`~pathlib.Path`
            Config path
        args: dict[str, Any]
            Parsed arguments
        unknown_args: list[str]
            Optional list of arguments that were not parsed

        Returns
        -------
        dict[str, Any]
            Configuration parameters

        Raises
        ------
        :class:`~montreal_forced_aligner.exceptions.ConfigError`
            If the first configured training block is not ``monophone``
        """
        global_params = {}
        training_params = []
        if config_path:
            data = load_configuration(config_path)
            for k, v in data.items():
                if k == "training":
                    for t in v:
                        for k2, v2 in t.items():
                            # A training block with an empty body loads as None;
                            # treat it as "no overrides" rather than failing on `in`
                            if v2 is None:
                                v2 = {}
                            # Feature settings nested under a trainer apply globally
                            if "features" in v2:
                                global_params.update(v2["features"])
                                del v2["features"]
                            training_params.append((k2, v2))
                elif k == "features":
                    # The config key "type" maps onto the "feature_type" parameter
                    if "type" in v:
                        v["feature_type"] = v["type"]
                        del v["type"]
                    global_params.update(v)
                else:
                    if v is None and k in cls.nullable_fields:
                        v = []
                    global_params[k] = v
        if not training_params:
            # default training configuration
            training_params.append(("monophone", {}))
        if training_params[0][0] != "monophone":
            raise ConfigError("The first round of training must be monophone.")
        global_params["training_configuration"] = training_params
        # Command-line arguments take precedence over values from the config file
        global_params.update(cls.parse_args(args, unknown_args))
        return global_params

    def setup(self) -> None:
        """
        Set up the corpus and validator

        Raises
        ------
        :class:`~montreal_forced_aligner.exceptions.KaldiProcessingError`
            If there were any errors in running Kaldi binaries
        """
        self.check_previous_run()
        if hasattr(self, "initialize_database"):
            self.initialize_database()
        if self.initialized:
            return
        try:
            all_begin = time.time()
            self.dictionary_setup()
            logger.debug(f"Loaded dictionary in {time.time() - all_begin:.3f} seconds")

            begin = time.time()
            self._load_corpus()
            logger.debug(f"Loaded corpus in {time.time() - begin:.3f} seconds")

            begin = time.time()
            self.initialize_jobs()
            logger.debug(f"Initialized jobs in {time.time() - begin:.3f} seconds")

            self.normalize_text()

            self.save_oovs_found(self.output_directory)

            begin = time.time()
            self.write_lexicon_information()
            self.write_training_information()
            if self.test_transcriptions:
                self.write_lexicon_information(write_disambiguation=True)
            logger.debug(f"Wrote lexicon information in {time.time() - begin:.3f} seconds")

            if self.ignore_acoustics:
                logger.info("Skipping acoustic feature generation")
            else:
                # NOTE(review): statement nesting in this branch was reconstructed from
                # whitespace-collapsed source -- confirm against the upstream file
                begin = time.time()
                self.generate_features()
                logger.debug(f"Generated features in {time.time() - begin:.3f} seconds")
                begin = time.time()
                self.save_oovs_found(self.output_directory)
                logger.debug(f"Calculated OOVs in {time.time() - begin:.3f} seconds")
                self.setup_trainers()

            self.initialized = True
        except KaldiProcessingError as e:
            # Surface the Kaldi logs before propagating; other exceptions propagate untouched
            log_kaldi_errors(e.error_logs)
            e.update_log_file()
            raise

    def validate(self) -> None:
        """
        Performs validation of the corpus
        """
        begin = time.time()
        self.setup()
        self.analyze_setup()
        # Logged only after the work is done, so the figure covers setup and analysis
        logger.debug(f"Setup took {time.time() - begin:.3f} seconds")
        if self.ignore_acoustics:
            logger.info("Skipping test alignments.")
            return
        logger.info("Training")
        self.train()
        if self.test_transcriptions:
            # NOTE(review): both calls are taken to be conditional on test_transcriptions;
            # nesting was reconstructed from whitespace-collapsed source
            self.test_utterance_transcriptions()
            self.get_phone_confidences()
class PretrainedValidator(PretrainedAligner, ValidationMixin):
    """
    Validator class for checking whether a corpus, a dictionary, and
    an acoustic model will work together for alignment

    See Also
    --------
    :class:`~montreal_forced_aligner.alignment.pretrained.PretrainedAligner`
        For alignment configuration
    :class:`~montreal_forced_aligner.validation.corpus_validator.ValidationMixin`
        For validation parameters
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def setup(self) -> None:
        """
        Set up the corpus and validator

        Raises
        ------
        :class:`~montreal_forced_aligner.exceptions.KaldiProcessingError`
            If there were any errors in running Kaldi binaries
        """
        self.dirty = True  # Always reset validate
        self.initialize_database()
        if self.initialized:
            return
        try:
            self.setup_acoustic_model()
            self.dictionary_setup()
            self._load_corpus()
            self.initialize_jobs()
            self.normalize_text()

            self.save_oovs_found(self.output_directory)

            if self.ignore_acoustics:
                logger.info("Skipping acoustic feature generation")
            else:
                # NOTE(review): statement nesting in this branch was reconstructed from
                # whitespace-collapsed source -- confirm against the upstream file
                self.write_lexicon_information()
                if self.test_transcriptions:
                    self.write_lexicon_information(write_disambiguation=True)
                self.generate_features()
            self.acoustic_model.validate(self)
            self.acoustic_model.log_details()

            self.initialized = True
            logger.info("Finished initializing!")
        except KaldiProcessingError as e:
            # Surface the Kaldi logs before propagating; other exceptions propagate untouched
            log_kaldi_errors(e.error_logs)
            e.update_log_file()
            raise

    def validate(self) -> None:
        """
        Performs validation of the corpus
        """
        self.initialize_database()
        self.create_new_current_workflow(WorkflowType.alignment)
        self.setup()
        self.analyze_setup()
        self.analyze_missing_phones()
        if self.ignore_acoustics:
            logger.info("Skipping test alignments.")
            return
        self.align()
        self.collect_alignments()
        if self.phone_confidence:
            self.get_phone_confidences()

        if self.use_phone_model:
            self.create_new_current_workflow(WorkflowType.phone_transcription)
            self.transcribe()
            self.collect_alignments()
        if self.test_transcriptions:
            # NOTE(review): everything below is taken to be conditional on
            # test_transcriptions; nesting was reconstructed from whitespace-collapsed source
            self.test_utterance_transcriptions()
            self.collect_alignments()
            self.transcription_done = True
            with self.session() as session:
                session.query(Corpus).update({"transcription_done": True})
                session.commit()

    def analyze_missing_phones(self) -> None:
        """Analyzes dictionary and acoustic model for phones in the dictionary that don't have acoustic models"""
        logger.info("Acoustic model compatibility")
        if self.excluded_pronunciation_count:
            # Logger methods take (msg, *args) with %-style formatting, not print-style
            # positional pieces, so each message is built as a single string
            logger.warning(f"{len(self.excluded_phones)} phones not in acoustic model")
            logger.warning(f"{self.excluded_pronunciation_count} ignored pronunciations")
            logger.error(
                f"Phones missing acoustic models: {comma_join(sorted(self.excluded_phones))}"
            )
        else:
            logger.info("There were no phones in the dictionary without acoustic models.")