Source code for montreal_forced_aligner.validation.corpus_validator

"""
Validating corpora
==================
"""
from __future__ import annotations

import logging
import os
import time
import typing
from decimal import Decimal
from pathlib import Path
from typing import TYPE_CHECKING, Any, Dict, Optional

import sqlalchemy

from montreal_forced_aligner.acoustic_modeling.trainer import TrainableAligner
from montreal_forced_aligner.alignment import PretrainedAligner
from montreal_forced_aligner.data import WorkflowType
from montreal_forced_aligner.db import Corpus, File, SoundFile, Speaker, TextFile, Utterance
from montreal_forced_aligner.exceptions import ConfigError, KaldiProcessingError
from montreal_forced_aligner.helper import comma_join, load_configuration, mfa_open
from montreal_forced_aligner.utils import log_kaldi_errors

if TYPE_CHECKING:
    from montreal_forced_aligner.abc import MetaDict


__all__ = ["TrainingValidator", "PretrainedValidator"]

logger = logging.getLogger("mfa")



[docs]
class ValidationMixin:
    """
    Mixin class for performing validation on a corpus

    Parameters
    ----------
    ignore_acoustics: bool
        Flag for whether feature generation and training/alignment should be skipped
    test_transcriptions: bool
        Flag for whether utterance transcriptions should be tested with a unigram language model
    target_num_ngrams: int
        Target number of ngrams from speaker models to use
    order: int
        Stored as ``self.order``; presumably the n-gram order of the language models built
        for transcription testing (TODO confirm against the language model trainer)
    method: str
        Stored as ``self.method``; presumably the smoothing method of those language models
        (TODO confirm against the language model trainer)
    **kwargs
        Any remaining keyword arguments are forwarded unchanged to the next class in the MRO

    Notes
    -----
    ``phone_alignment`` (flag for whether alignments should be compared to a phone-based
    system) was documented here previously but is not a named parameter of ``__init__``;
    if supplied it is simply forwarded through ``**kwargs``.
    NOTE(review): confirm whether it is still supported by a parent class.

    See Also
    --------
    :class:`~montreal_forced_aligner.alignment.base.CorpusAligner`
        For corpus, dictionary, and alignment parameters

    """

    def __init__(
        self,
        ignore_acoustics: bool = False,
        test_transcriptions: bool = False,
        target_num_ngrams: int = 100,
        order: int = 3,
        method: str = "kneser_ney",
        **kwargs,
    ):
        # Cooperative initialization: everything not consumed here goes up the MRO.
        super().__init__(**kwargs)
        self.ignore_acoustics = ignore_acoustics
        self.test_transcriptions = test_transcriptions
        self.target_num_ngrams = target_num_ngrams
        self.order = order
        self.method = method

    @property
    def working_log_directory(self) -> Path:
        """Working log directory (the ``log`` entry inside the current working directory)"""
        return self.working_directory.joinpath("log")


[docs]
    def analyze_setup(self) -> None:
        """
        Analyzes the setup process and outputs info to the console

        Logs the number of sound files, text files, speakers and utterances together with
        the total utterance duration, then runs the individual corpus analyses (sound file
        errors, missing features, missing transcriptions/sound files, text and TextGrid read
        errors) followed by the out-of-vocabulary analysis.
        """
        begin = time.time()

        with self.session() as session:
            sound_file_count = session.query(SoundFile).count()
            text_file_count = session.query(TextFile).count()
            total_duration = session.query(sqlalchemy.func.sum(Utterance.duration)).scalar()

        # SUM() over zero rows yields NULL/None; report an empty corpus as 0 seconds
        # rather than failing on Decimal("None").
        if total_duration is None:
            total_duration = 0
        total_duration = Decimal(str(total_duration)).quantize(Decimal("0.001"))
        logger.debug(f"Duration calculation took {time.time() - begin:.3f} seconds")

        begin = time.time()
        # Only the time needed to size these collections is reported; the count itself
        # is not used further in this method.
        ignored_count = len(self.no_transcription_files)
        ignored_count += len(self.textgrid_read_errors)
        ignored_count += len(self.decode_error_files)
        logger.debug(f"Ignored count calculation took {time.time() - begin:.3f} seconds")

        logger.info("Corpus")
        logger.info(f"{sound_file_count} sound files")
        logger.info(f"{text_file_count} text files")
        if self.no_transcription_files:
            logger.warning(
                f"{len(self.no_transcription_files)} sound files without corresponding transcriptions",
            )
        if self.decode_error_files:
            logger.error(f"{len(self.decode_error_files)} read errors for lab files")
        if self.textgrid_read_errors:
            logger.error(f"{len(self.textgrid_read_errors)} read errors for TextGrid files")

        logger.info(f"{self.num_speakers} speakers")
        logger.info(f"{self.num_utterances} utterances")
        logger.info(f"{total_duration} seconds total duration")
        self.analyze_wav_errors()
        self.analyze_missing_features()
        self.analyze_files_with_no_transcription()
        self.analyze_transcriptions_with_no_wavs()

        # These two reports are only produced when there is something to report.
        if self.decode_error_files:
            self.analyze_unreadable_text_files()
        if self.textgrid_read_errors:
            self.analyze_textgrid_read_errors()

        logger.info("Dictionary")
        self.analyze_oovs()



[docs]
    def analyze_oovs(self) -> None:
        """
        Analyzes OOVs in the corpus and constructs message

        Writes a by-utterance breakdown of out-of-vocabulary words to
        ``utterance_oovs.txt`` in the output directory, accumulates the words in
        ``self.oovs_found``, and logs a summary of OOV types and tokens.
        """
        logger.info("Out of vocabulary words")
        output_dir = self.output_directory
        oov_path = os.path.join(output_dir, "oovs_found.txt")
        utterance_oov_path = os.path.join(output_dir, "utterance_oovs.txt")

        total_instances = 0
        with mfa_open(utterance_oov_path, "w") as f, self.session() as session:
            utterances = (
                session.query(
                    File.name,
                    File.relative_path,
                    Speaker.name,
                    Utterance.begin,
                    Utterance.end,
                    Utterance.oovs,
                )
                .join(Utterance.file)
                .join(Utterance.speaker)
                .filter(Utterance.oovs != None)  # noqa
                .filter(Utterance.oovs != "")
            )

            for file_name, relative_path, speaker_name, begin, end, oovs in utterances:
                # The query filters ``oovs`` against "", so the column looks textual.
                # Iterating a plain string would count and join single characters, so
                # split it into words first; any other sequence is used as-is.
                # NOTE(review): assumes a whitespace-delimited string — confirm against
                # the Utterance model.
                oov_words = oovs.split() if isinstance(oovs, str) else list(oovs)
                total_instances += len(oov_words)
                f.write(
                    f"{relative_path.joinpath(file_name)}, {speaker_name}: {begin}-{end}: {', '.join(oov_words)}\n"
                )
                self.oovs_found.update(oov_words)
        if self.oovs_found:
            self.save_oovs_found(self.output_directory)
            logger.warning(f"{len(self.oovs_found)} OOV word types")
            logger.warning(f"{total_instances} total OOV tokens")
            logger.warning(
                f"For a full list of the word types, please see: {oov_path}. "
                f"For a by-utterance breakdown of missing words, see: {utterance_oov_path}"
            )
        else:
            logger.info(
                "There were no missing words from the dictionary. If you plan on using a model trained "
                "on this dataset to align other datasets in the future, it is recommended that there be at "
                "least some missing words."
            )



[docs]
    def analyze_wav_errors(self) -> None:
        """
        Reports sound files that could not be read

        When ``self.sound_file_errors`` is non-empty, each entry is written on its own
        line to ``sound_file_errors.csv`` in the output directory and an error is logged;
        otherwise an informational message is logged.
        """
        logger.info("Sound file read errors")

        report_dir = self.output_directory
        failed_sound_files = self.sound_file_errors
        if not failed_sound_files:
            logger.info("There were no issues reading sound files.")
            return

        path = os.path.join(report_dir, "sound_file_errors.csv")
        with mfa_open(path, "w") as report:
            for failed in failed_sound_files:
                report.write(f"{failed}\n")

        logger.error(
            f"There were {len(failed_sound_files)} issues reading sound files. "
            f"Please see {path} for a list."
        )



[docs]
    def analyze_missing_features(self) -> None:
        """
        Analyzes issues in feature generation in the corpus and constructs message

        Utterances flagged as ``ignored`` are listed in ``missing_features.csv`` in the
        output directory as ``path,begin,end`` rows. Nothing is queried when
        ``ignore_acoustics`` is set.
        """
        logger.info("Feature generation")
        if self.ignore_acoustics:
            logger.info("Acoustic feature generation was skipped.")
            return
        output_dir = self.output_directory
        with self.session() as session:
            utterances = (
                session.query(File.name, File.relative_path, Utterance.begin, Utterance.end)
                .join(Utterance.file)
                .filter(Utterance.ignored == True)  # noqa
            )
            # Count once: every ``count()`` call issues its own query.
            missing_count = utterances.count()
            if not missing_count:
                logger.info("There were no utterances missing features.")
                return

            path = os.path.join(output_dir, "missing_features.csv")
            with mfa_open(path, "w") as f:
                for file_name, relative_path, begin, end in utterances:
                    f.write(f"{relative_path.joinpath(file_name)},{begin},{end}\n")

            logger.error(
                f"There were {missing_count} utterances missing features. "
                f"Please see {path} for a list."
            )



[docs]
    def analyze_files_with_no_transcription(self) -> None:
        """
        Reports sound files that have no transcription file

        When ``self.no_transcription_files`` is non-empty, each entry is written on its
        own line to ``missing_transcriptions.csv`` in the output directory and errors are
        logged; otherwise an informational message is logged.
        """
        logger.info("Files without transcriptions")
        report_dir = self.output_directory
        if not self.no_transcription_files:
            logger.info("There were no sound files missing transcriptions.")
            return

        path = os.path.join(report_dir, "missing_transcriptions.csv")
        with mfa_open(path, "w") as report:
            for untranscribed in self.no_transcription_files:
                report.write(f"{untranscribed}\n")
        logger.error(
            f"There were {len(self.no_transcription_files)} sound files missing transcriptions."
        )
        logger.error(f"Please see {path} for a list.")



[docs]
    def analyze_transcriptions_with_no_wavs(self) -> None:
        """
        Reports transcription files that have no sound file

        When ``self.transcriptions_without_wavs`` is non-empty, each entry is written on
        its own line to ``transcriptions_missing_sound_files.csv`` in the output directory
        and an error is logged; otherwise an informational message is logged.
        """
        logger.info("Transcriptions without sound files")
        report_dir = self.output_directory
        if not self.transcriptions_without_wavs:
            logger.info("There were no transcription files missing sound files.")
            return

        path = os.path.join(report_dir, "transcriptions_missing_sound_files.csv")
        with mfa_open(path, "w") as report:
            for orphaned in self.transcriptions_without_wavs:
                report.write(f"{orphaned}\n")
        logger.error(
            f"There were {len(self.transcriptions_without_wavs)} transcription files missing sound files. "
            f"Please see {path} for a list."
        )



[docs]
    def analyze_textgrid_read_errors(self) -> None:
        """
        Reports TextGrid files that could not be loaded

        When ``self.textgrid_read_errors`` is non-empty, one entry per error (its
        ``file_name`` followed by the error text) is written to
        ``textgrid_read_errors.txt`` in the output directory and an error is logged;
        otherwise an informational message is logged.
        """
        logger.info("TextGrid read errors")
        report_dir = self.output_directory
        if not self.textgrid_read_errors:
            logger.info("There were no issues reading TextGrids.")
            return

        path = os.path.join(report_dir, "textgrid_read_errors.txt")
        with mfa_open(path, "w") as report:
            for e in self.textgrid_read_errors:
                entry = f"The TextGrid file {e.file_name} gave the following error on load:\n\n{e}\n\n\n"
                report.write(entry)
        logger.error(
            f"There were {len(self.textgrid_read_errors)} TextGrid files that could not be loaded. "
            f"For details, please see: {path}",
        )



[docs]
    def analyze_unreadable_text_files(self) -> None:
        """
        Reports text files that could not be read

        When ``self.decode_error_files`` is non-empty, each entry is written on its own
        line to ``utf8_read_errors.csv`` in the output directory and an error is logged;
        otherwise an informational message is logged.
        """
        logger.info("Text file read errors")
        report_dir = self.output_directory
        if not self.decode_error_files:
            logger.info("There were no issues reading text files.")
            return

        path = os.path.join(report_dir, "utf8_read_errors.csv")
        with mfa_open(path, "w") as report:
            for unreadable in self.decode_error_files:
                report.write(f"{unreadable}\n")
        logger.error(
            f"There were {len(self.decode_error_files)} text files that could not be read. "
            f"Please see {path} for a list."
        )



[docs]
    def test_utterance_transcriptions(self) -> None:
        """
        Tests utterance transcriptions with simple unigram models based on the utterance text and frequent
        words in the corpus

        Trains per-speaker language models, transcribes with the per-speaker workflow,
        logs sentence/word/character error rates at a severity that depends on their
        value, and saves the transcription evaluation to the output directory.

        Raises
        ------
        :class:`~montreal_forced_aligner.exceptions.KaldiProcessingError`
            If there were any errors in running Kaldi binaries
        """
        try:
            self.train_speaker_lms()
            self.transcribe(WorkflowType.per_speaker_transcription)

            logger.info("Test transcriptions")
            ser, wer, cer = self.compute_wer()

            # (rate, label, below this -> info, below this -> warning, otherwise error)
            rate_levels = (
                (ser, "sentence error rate", 0.3, 0.8),
                (wer, "word error rate", 0.25, 0.75),
                (cer, "character error rate", 0.25, 0.75),
            )
            for rate, label, info_below, warning_below in rate_levels:
                if rate < info_below:
                    emit = logger.info
                elif rate < warning_below:
                    emit = logger.warning
                else:
                    emit = logger.error
                emit(f"{rate*100:.2f}% {label}")

            self.save_transcription_evaluation(self.output_directory)
            out_path = os.path.join(self.output_directory, "transcription_evaluation.csv")
            logger.info(f"See {out_path} for more details.")

        except KaldiProcessingError as e:
            # Surface the Kaldi logs before propagating; other exceptions pass through untouched.
            log_kaldi_errors(e.error_logs)
            e.update_log_file()
            raise





[docs]
class TrainingValidator(TrainableAligner, ValidationMixin):
    """
    Validator class for checking whether a corpus and a dictionary will work together
    for training

    Parameters
    ----------
    training_configuration: list[tuple[str, dict]], optional
        ``(trainer name, options)`` pairs, each passed to ``add_config``; when omitted a
        single ``("monophone", {})`` round is used
    **kwargs
        All other keyword arguments are forwarded to the parent classes

    See Also
    --------
    :class:`~montreal_forced_aligner.acoustic_modeling.trainer.TrainableAligner`
        For training configuration
    :class:`~montreal_forced_aligner.validation.corpus_validator.ValidationMixin`
        For validation parameters

    Attributes
    ----------
    training_configs: dict[str, :class:`~montreal_forced_aligner.acoustic_modeling.monophone.MonophoneTrainer`]
        Reset to an empty dict before the training rounds are added via ``add_config``
    """

    def __init__(self, **kwargs):
        # Pull out the validator-specific option before the parents see the kwargs.
        requested_rounds = kwargs.pop("training_configuration", None)
        super().__init__(**kwargs)
        self.training_configs = {}
        if requested_rounds is None:
            requested_rounds = [("monophone", {})]
        for trainer_name, trainer_options in requested_rounds:
            self.add_config(trainer_name, trainer_options)

    @property
    def working_directory(self) -> Path:
        """
        Working directory for the current workflow

        Transcription and per-speaker transcription workflows use a directory named after
        the current workflow inside the output directory; every other workflow uses the
        parent class's working directory.
        """
        transcription_workflows = (
            WorkflowType.transcription,
            WorkflowType.per_speaker_transcription,
        )
        if self.current_workflow.workflow_type not in transcription_workflows:
            return super().working_directory
        return self.output_directory.joinpath(self._current_workflow)


[docs]
    @classmethod
    def parse_parameters(
        cls,
        config_path: Optional[Path] = None,
        args: Optional[Dict[str, Any]] = None,
        unknown_args: Optional[typing.Iterable[str]] = None,
    ) -> MetaDict:

        """
        Parse parameters for validation from a config path or command-line arguments

        Top-level ``features`` settings and any per-trainer ``features`` settings are
        merged into the global parameters, the ``training`` section becomes the
        ``training_configuration`` list, and values from ``cls.parse_args`` are applied
        last so they override the config file.

        Parameters
        ----------
        config_path: :class:`~pathlib.Path`
            Config path
        args: dict[str, Any]
            Parsed arguments
        unknown_args: list[str]
            Optional list of arguments that were not parsed

        Returns
        -------
        dict[str, Any]
            Configuration parameters

        Raises
        ------
        :class:`~montreal_forced_aligner.exceptions.ConfigError`
            If the first configured training round is not ``monophone``
        """
        global_params = {}
        training_params = []
        if config_path:
            data = load_configuration(config_path)
            for k, v in data.items():
                if k == "training":
                    # An empty ``training:`` key loads as None; treat it as no rounds.
                    for t in v or []:
                        for k2, v2 in t.items():
                            # A trainer listed without options loads as None.
                            if v2 is None:
                                v2 = {}
                            if "features" in v2:
                                global_params.update(v2.pop("features"))
                            training_params.append((k2, v2))
                elif k == "features":
                    if "type" in v:
                        v["feature_type"] = v.pop("type")
                    global_params.update(v)
                else:
                    if v is None and k in cls.nullable_fields:
                        v = []
                    global_params[k] = v
        if not training_params:  # default training configuration
            training_params.append(("monophone", {}))
        if training_params[0][0] != "monophone":
            raise ConfigError("The first round of training must be monophone.")
        global_params["training_configuration"] = training_params
        global_params.update(cls.parse_args(args, unknown_args))
        return global_params



[docs]
    def setup(self) -> None:
        """
        Set up the corpus and validator

        Does nothing further once ``self.initialized`` is set. Otherwise loads the
        dictionary and corpus, initializes jobs, normalizes text, writes OOV, lexicon and
        training information and, unless ``ignore_acoustics`` is set, generates features
        and sets up the trainers.

        Raises
        ------
        :class:`~montreal_forced_aligner.exceptions.KaldiProcessingError`
            If there were any errors in running Kaldi binaries
        """
        self.check_previous_run()
        if hasattr(self, "initialize_database"):
            self.initialize_database()
        if self.initialized:
            return
        try:
            started = time.time()
            self.dictionary_setup()
            logger.debug(f"Loaded dictionary in {time.time() - started:.3f} seconds")

            started = time.time()
            self._load_corpus()
            logger.debug(f"Loaded corpus in {time.time() - started:.3f} seconds")

            started = time.time()
            self.initialize_jobs()
            logger.debug(f"Initialized jobs in {time.time() - started:.3f} seconds")

            self.normalize_text()

            self.save_oovs_found(self.output_directory)

            started = time.time()
            self.write_lexicon_information()
            self.write_training_information()
            if self.test_transcriptions:
                self.write_lexicon_information(write_disambiguation=True)
            logger.debug(f"Wrote lexicon information in {time.time() - started:.3f} seconds")

            if not self.ignore_acoustics:
                started = time.time()
                self.generate_features()
                logger.debug(f"Generated features in {time.time() - started:.3f} seconds")
                started = time.time()
                self.save_oovs_found(self.output_directory)
                logger.debug(f"Calculated OOVs in {time.time() - started:.3f} seconds")
                self.setup_trainers()
            else:
                logger.info("Skipping acoustic feature generation")

            self.initialized = True
        except KaldiProcessingError as e:
            # Surface the Kaldi logs before propagating; other exceptions pass through untouched.
            log_kaldi_errors(e.error_logs)
            e.update_log_file()
            raise



[docs]
    def validate(self) -> None:
        """
        Performs validation of the corpus

        Runs setup and the setup analysis, then (unless ``ignore_acoustics`` is set)
        trains and, when ``test_transcriptions`` is set, tests utterance transcriptions
        and computes phone confidences.
        """
        begin = time.time()
        self.setup()
        self.analyze_setup()
        # Logged once, after the work has run; the elapsed time covers setup and its analysis.
        logger.debug(f"Setup took {time.time() - begin:.3f} seconds")
        if self.ignore_acoustics:
            logger.info("Skipping test alignments.")
            return
        logger.info("Training")
        self.train()
        if self.test_transcriptions:
            self.test_utterance_transcriptions()
            self.get_phone_confidences()





[docs]
class PretrainedValidator(PretrainedAligner, ValidationMixin):
    """
    Validator class for checking whether a corpus, a dictionary, and
    an acoustic model will work together for alignment

    See Also
    --------
    :class:`~montreal_forced_aligner.alignment.pretrained.PretrainedAligner`
        For alignment configuration
    :class:`~montreal_forced_aligner.validation.corpus_validator.ValidationMixin`
        For validation parameters
    """

    def __init__(self, **kwargs):
        """Forward all keyword arguments unchanged to the parent classes"""
        super().__init__(**kwargs)


[docs]
    def setup(self) -> None:
        """
        Set up the corpus and validator

        Marks the run as dirty, initializes the database and, unless already initialized,
        sets up the acoustic model, dictionary and corpus. Lexicon information and
        features are only produced when ``ignore_acoustics`` is not set.

        Raises
        ------
        :class:`~montreal_forced_aligner.exceptions.KaldiProcessingError`
            If there were any errors in running Kaldi binaries
        """
        self.dirty = True  # Always reset validate
        self.initialize_database()
        if self.initialized:
            return
        try:
            self.setup_acoustic_model()
            self.dictionary_setup()
            self._load_corpus()
            self.initialize_jobs()
            self.normalize_text()

            self.save_oovs_found(self.output_directory)

            if not self.ignore_acoustics:
                self.write_lexicon_information()

                if self.test_transcriptions:
                    self.write_lexicon_information(write_disambiguation=True)
                self.generate_features()
            else:
                logger.info("Skipping acoustic feature generation")
            self.acoustic_model.validate(self)
            self.acoustic_model.log_details()

            self.initialized = True
            logger.info("Finished initializing!")
        except KaldiProcessingError as e:
            # Surface the Kaldi logs before propagating; other exceptions pass through untouched.
            log_kaldi_errors(e.error_logs)
            e.update_log_file()
            raise



[docs]
    def validate(self) -> None:
        """
        Performs validation of the corpus

        Creates an alignment workflow, runs setup and the setup/phone analyses, then
        (unless ``ignore_acoustics`` is set) aligns the corpus and collects alignments.
        Phone confidences, phone-model transcription and transcription testing are run
        only when their respective flags are set on the instance.
        """
        self.initialize_database()
        self.create_new_current_workflow(WorkflowType.alignment)
        self.setup()
        self.analyze_setup()
        self.analyze_missing_phones()
        if self.ignore_acoustics:
            # Everything below needs acoustic features.
            logger.info("Skipping test alignments.")
            return
        self.align()
        self.collect_alignments()
        if self.phone_confidence:
            self.get_phone_confidences()

        if self.use_phone_model:
            # Switch to a phone transcription workflow before transcribing.
            self.create_new_current_workflow(WorkflowType.phone_transcription)
            self.transcribe()
            self.collect_alignments()
        if self.test_transcriptions:
            self.test_utterance_transcriptions()
            self.collect_alignments()
            # Record completion both on the instance and on the Corpus rows in the database.
            self.transcription_done = True
            with self.session() as session:
                session.query(Corpus).update({"transcription_done": True})
                session.commit()



[docs]
    def analyze_missing_phones(self) -> None:
        """
        Analyzes dictionary and acoustic model for phones in the dictionary that don't have acoustic models

        When ``self.excluded_pronunciation_count`` is non-zero, logs the number of excluded
        phones and ignored pronunciations followed by the sorted list of excluded phones;
        otherwise logs that every dictionary phone has an acoustic model.
        """
        logger.info("Acoustic model compatibility")
        if self.excluded_pronunciation_count:
            # The first argument of a logging call is the message (format string), so the
            # counts must be interpolated into it rather than passed print-style.
            logger.warning(f"{len(self.excluded_phones)} phones not in acoustic model")
            logger.warning(f"{self.excluded_pronunciation_count} ignored pronunciations")

            logger.error(
                f"Phones missing acoustic models: {comma_join(sorted(self.excluded_phones))}"
            )
        else:
            logger.info("There were no phones in the dictionary without acoustic models.")