Source code for montreal_forced_aligner.data

"""
Data classes
============

"""
from __future__ import annotations

import collections
import enum
import io
import itertools
import math
import re
import typing
from pathlib import Path

import dataclassy
import pynini
import pywrapfst
from praatio.utilities.constants import Interval, TextgridFormats
from sqlalchemy.orm import scoped_session

from montreal_forced_aligner.exceptions import CtmError

# Names exported by ``from montreal_forced_aligner.data import *``.
# ``TextgridFormats`` is re-exported from praatio (imported above).
__all__ = [
    "MfaArguments",
    "CtmInterval",
    "TextFileType",
    "TextgridFormats",
    "SoundFileType",
    "WordType",
    "PhoneType",
    "PhoneSetType",
    "WordData",
    "DatabaseImportData",
    "PronunciationProbabilityCounter",
    "ManifoldAlgorithm",
    "ClusterType",
    "DistanceMetric",
    "WorkflowType",
    "DatasetType",
    "Language",
    "ArpaNgramModel",
    "WORD_BEGIN_SYMBOL",
    "WORD_END_SYMBOL",
    "OOV_WORD",
    "BRACKETED_WORD",
    "LAUGHTER_WORD",
    "CUTOFF_WORD",
    "SIL_WORD",
    "SIL_PHONE",
    "OOV_PHONE",
]

# Special word and phone symbols.
# NOTE(review): the roles noted below are inferred from the constant names only -- confirm against callers.
WORD_BEGIN_SYMBOL = "#1"  # word-begin marker
WORD_END_SYMBOL = "#2"  # word-end marker
OOV_WORD = "<unk>"  # out-of-vocabulary word
BRACKETED_WORD = "<bracketed>"  # bracketed word
LAUGHTER_WORD = "[laughter]"  # laughter word
CUTOFF_WORD = "<cutoff>"  # cutoff word
SIL_WORD = "<eps>"  # silence word
SIL_PHONE = "sil"  # silence phone
OOV_PHONE = "spn"  # out-of-vocabulary phone


# noinspection PyUnresolvedReferences

[docs]
@dataclassy.dataclass(slots=True)
class DatabaseImportData:
    """
    Class for storing information on importing data into the database

    Every field is a list that starts out empty for each new instance.

    Parameters
    ----------
    speaker_objects: list[dict[str, Any]]
        List of dictionaries with :class:`~montreal_forced_aligner.db.Speaker` properties
    file_objects: list[dict[str, Any]]
        List of dictionaries with :class:`~montreal_forced_aligner.db.File` properties
    text_file_objects: list[dict[str, Any]]
        List of dictionaries with :class:`~montreal_forced_aligner.db.TextFile` properties
    sound_file_objects: list[dict[str, Any]]
        List of dictionaries with :class:`~montreal_forced_aligner.db.SoundFile` properties
    speaker_ordering_objects: list[dict[str, Any]]
        List of dictionaries with :class:`~montreal_forced_aligner.db.SpeakerOrdering` properties
    utterance_objects: list[dict[str, Any]]
        List of dictionaries with :class:`~montreal_forced_aligner.db.Utterance` properties
    """

    speaker_objects: typing.List[typing.Dict[str, typing.Any]] = dataclassy.factory(list)
    file_objects: typing.List[typing.Dict[str, typing.Any]] = dataclassy.factory(list)
    text_file_objects: typing.List[typing.Dict[str, typing.Any]] = dataclassy.factory(list)
    sound_file_objects: typing.List[typing.Dict[str, typing.Any]] = dataclassy.factory(list)
    speaker_ordering_objects: typing.List[typing.Dict[str, typing.Any]] = dataclassy.factory(list)
    utterance_objects: typing.List[typing.Dict[str, typing.Any]] = dataclassy.factory(list)

    def add_objects(self, other_import: DatabaseImportData) -> None:
        """
        Combine objects for two importers

        Each list of ``other_import`` is appended in place to the matching list of this
        object; ``other_import`` itself is not modified.

        Parameters
        ----------
        other_import: :class:`~montreal_forced_aligner.data.DatabaseImportData`
            Other object with objects to import
        """
        for field_name in (
            "speaker_objects",
            "file_objects",
            "text_file_objects",
            "sound_file_objects",
            "speaker_ordering_objects",
            "utterance_objects",
        ):
            getattr(self, field_name).extend(getattr(other_import, field_name))




# noinspection PyUnresolvedReferences
@dataclassy.dataclass(slots=True)
class PhonologicalRule:
    """
    Phonological rule that rewrites a segment of a space-delimited pronunciation

    Phone strings are inserted into regular expressions as-is (they are not escaped).

    Parameters
    ----------
    preceding_context: list[set[str]]
        One collection of alternative phones per position before the segment
    segment: list[set[str]]
        One collection of alternative phones per position of the segment to rewrite
    following_context: list[set[str]]
        One collection of alternative phones per position after the segment
    replacement: list[str], optional
        Phones substituted for the segment; ``None`` or an empty list deletes the segment
    dialect: str, optional
        Dialect label, exported unchanged by :meth:`to_json`
    probability: float, optional
        Probability value, exported unchanged by :meth:`to_json`
    initial: bool
        Anchor the rule to the start of the pronunciation
    final: bool
        Anchor the rule to the end of the pronunciation
    """

    preceding_context: typing.List[typing.Set]
    segment: typing.List[typing.Set]
    following_context: typing.List[typing.Set]
    replacement: typing.Optional[typing.List[str]]
    dialect: typing.Optional[str] = None
    probability: typing.Optional[float] = None
    initial: bool = False
    final: bool = False

    @staticmethod
    def _alternations(positions) -> str:
        """Render each position as a ``(a|b|c)`` group and join the groups with single spaces"""
        return " ".join(f'({"|".join(phones)})' for phones in positions)

    def _compile(self, group_name: str, core: str) -> re.Pattern:
        """Compile ``preceding core following`` with named groups plus the initial/final anchors"""
        preceding = self._alternations(self.preceding_context)
        following = self._alternations(self.following_context)
        components = []
        if preceding:
            components.append(rf"(?P<preceding>{preceding})")
        if core:
            components.append(rf"(?P<{group_name}>{core})")
        if following:
            components.append(rf"(?P<following>{following})")
        pattern = " ".join(components)
        if self.initial:
            pattern = "^" + pattern
        if self.final:
            pattern += "$"
        return re.compile(pattern, flags=re.UNICODE)

    def apply_rule(self, pronunciation: str) -> str:
        """
        Apply the rule on a pronunciation by replacing any matching segments with the replacement

        Parameters
        ----------
        pronunciation: str
            Pronunciation to apply rule

        Returns
        -------
        str
            Pronunciation with rule applied
        """
        components = []
        # Contexts are matched but must survive the substitution, so they are
        # re-emitted through back-references to the named groups
        if self._alternations(self.preceding_context):
            components.append(r"\g<preceding>")
        if self.replacement_regex:
            components.append(self.replacement_regex)
        if self._alternations(self.following_context):
            components.append(r"\g<following>")
        # ``unapplied_pattern`` is the pattern that defines the ``preceding``,
        # ``segment`` and ``following`` groups referenced above
        return self.unapplied_pattern.sub(" ".join(components), pronunciation).strip()

    @property
    def total_input_length(self):
        """Number of positions across preceding context, segment, and following context"""
        return len(self.preceding_context) + len(self.segment) + len(self.following_context)

    @property
    def preceding_regex(self):
        """Regex source for the preceding context, prefixed with ``^`` for initial rules"""
        pattern = self._alternations(self.preceding_context)
        if self.initial:
            pattern = "^" + pattern
        return pattern

    @property
    def following_regex(self):
        """Regex source for the following context, suffixed with ``$`` for final rules"""
        pattern = self._alternations(self.following_context)
        if self.final:
            pattern += "$"
        return pattern

    @property
    def segment_regex(self):
        """Regex source for the segment to rewrite"""
        return self._alternations(self.segment)

    @property
    def replacement_regex(self):
        """Space-joined replacement phones; empty string when there is no replacement"""
        return " ".join(self.replacement or [])

    @property
    def unapplied_pattern(self) -> re.Pattern:
        """
        Pattern matching pronunciations the rule can still apply to

        Defines the named groups ``preceding``, ``segment`` and ``following`` (each only
        when non-empty).  The pattern is rebuilt on every access: the class uses slots,
        so there is no attribute to cache it in.
        """
        return self._compile("segment", self.segment_regex)

    def to_json(self) -> typing.Dict[str, typing.Any]:
        """
        Serializes the rule for export

        Returns
        -------
        dict[str, Any]
            Serialized rule
        """
        return {
            "segment": self.segment_regex,
            "dialect": self.dialect,
            "preceding_context": self.preceding_regex,
            "following_context": self.following_regex,
            "replacement": self.replacement_regex,
            "probability": self.probability,
        }

    @property
    def applied_pattern(self):
        """
        Pattern matching pronunciations in which the rule has been applied

        Same shape as :attr:`unapplied_pattern`, with the replacement (named group
        ``replacement``) in place of the segment.  Rebuilt on every access.
        """
        return self._compile("replacement", self.replacement_regex)

    @property
    def replacement_pairs(self):
        """
        Position-wise (segment, replacement) pairs, with the shorter side padded

        Missing segment positions are padded with ``["<eps>"]`` and missing replacement
        positions with ``"<eps>"`` so both sides have the same length.
        """
        inputs = list(self.segment)
        outputs = list(self.replacement or [])
        shortfall = len(outputs) - len(inputs)
        # ``range`` of a negative number is empty, so only the shorter side is padded
        inputs.extend(["<eps>"] for _ in range(shortfall))
        outputs.extend("<eps>" for _ in range(-shortfall))
        return list(zip(inputs, outputs))


# noinspection PyUnresolvedReferences

[docs]
@dataclassy.dataclass(slots=True)
class MfaArguments:
    """
    Base class for argument classes for MFA functions

    Attributes
    ----------
    job_name: int
        Integer ID of the job
    session: :class:`~sqlalchemy.orm.scoped_session` or str
        Scoped database session, or a string (presumably a database connection
        string -- confirm against callers)
    log_path: :class:`~pathlib.Path`
        Path to save logging information during the run
    """

    job_name: int
    session: typing.Union[scoped_session, str]
    log_path: Path




[docs]
class TextFileType(enum.Enum):
    """Enum for types of text files"""

    NONE = "none"  #: No text file
    TEXTGRID = TextgridFormats.LONG_TEXTGRID  #: Praat's long textgrid format
    SHORT_TEXTGRID = TextgridFormats.SHORT_TEXTGRID  #: Praat's short textgrid format
    LAB = "lab"  #: Text file
    JSON = TextgridFormats.JSON  #: JSON

    def __str__(self) -> str:
        """Return the member's value rather than its name"""
        return self.value



class DatasetType(enum.Enum):
    """Enum for types of datasets, i.e., which combination of corpus and dictionary has been imported"""

    NONE = 0  #: Nothing has been imported
    ACOUSTIC_CORPUS = 1  #: Imported corpus with sound files (and maybe text files)
    TEXT_CORPUS = 2  #: Imported corpus with just text files
    ACOUSTIC_CORPUS_WITH_DICTIONARY = (
        3  #: Imported corpus and pronunciation dictionary with sound files
    )
    TEXT_CORPUS_WITH_DICTIONARY = (
        4  #: Imported corpus and pronunciation dictionary with just text files
    )
    DICTIONARY = 5  #: Only imported pronunciation dictionary (for G2P)



[docs]
class SoundFileType(enum.Enum):
    """Enum for types of sound files, distinguished by how the audio has to be read"""

    NONE = 0  #: No sound file
    WAV = 1  #: Can be read as a .wav file
    SOX = 2  #: Needs to use SoX to preprocess



def voiceless_variants(base_phone) -> typing.Set[str]:
    """
    Build the set of diacritic-marked forms of a voiceless IPA phone

    Parameters
    ----------
    base_phone: str
        Voiceless IPA phone

    Returns
    -------
    set[str]
        The bare phone plus the phone followed by each supported diacritic
    """
    # The empty suffix keeps the bare phone in the result
    suffixes = ("", "ʱ", "ʼ", "ʰ", "ʲ", "ʷ", "ˠ", "ˀ", "̚", "͈")
    variants = set()
    for suffix in suffixes:
        variants.add(base_phone + suffix)
    return variants


def voiced_variants(base_phone) -> typing.Set[str]:
    """
    Build the set of diacritic-marked forms of a voiced IPA phone

    Parameters
    ----------
    base_phone: str
        Voiced IPA phone

    Returns
    -------
    set[str]
        The bare phone, the phone followed by each supported diacritic, and the
        phone preceded by each supported prefix
    """
    suffixes = ("", "ʱ", "ʲ", "ʷ", "ⁿ", "ˠ", "̚")
    prefixes = ("ⁿ",)
    variants = set()
    for suffix in suffixes:
        variants.add(base_phone + suffix)
    for prefix in prefixes:
        variants.add(prefix + base_phone)
    return variants



[docs]
class PhoneType(enum.Enum):
    """Enum for types of phones (members carry integer values 1-5)"""

    non_silence = 1  #: Speech sounds
    silence = 2  #: Silence phones
    oov = 3  #: Out of vocabulary/spoken noise phones
    disambiguation = 4  #: Disambiguation phones internal to Kaldi
    extra = 5  #: Phones not to be included generally, i.e., loaded from reference intervals




[docs]
class WorkflowType(enum.Enum):
    """
    Enum for workflows involving corpora

    Parameters
    ----------
    reference: int
        Load alignments from reference directory
    alignment: int
        Align using corpus texts, acoustic model, and pronunciation dictionary
    transcription: int
        Transcribe using acoustic model, pronunciation dictionary, and language model
    phone_transcription: int
        Transcribe using acoustic model and phone-based language model
    per_speaker_transcription: int
        Transcribe using acoustic model, pronunciation dictionary, and per-speaker language model generated by corpus texts
    speaker_diarization: int
        Diarize speakers
    online_alignment: int
        Online alignment
    acoustic_training: int
        Acoustic model training
    acoustic_model_adaptation: int
        Acoustic model adaptation
    segmentation: int
        Segment based on speech activity
    train_g2p: int
        G2P model training (description inferred from the member name)
    g2p: int
        G2P generation (description inferred from the member name)
    language_model_training: int
        Language model training (description inferred from the member name)
    tokenizer_training: int
        Tokenizer training (description inferred from the member name)
    """

    reference = 0
    alignment = 1
    transcription = 2
    phone_transcription = 3
    per_speaker_transcription = 4
    speaker_diarization = 5
    online_alignment = 6
    acoustic_training = 7
    acoustic_model_adaptation = 8
    segmentation = 9
    train_g2p = 10
    g2p = 11
    language_model_training = 12
    tokenizer_training = 13




[docs]
class WordType(enum.Enum):
    """Enum for the categories a word can belong to"""

    speech = 1  #: General speech words
    clitic = 2  #: Clitics that must attach to words
    silence = 3  #: Words representing silence
    oov = 4  #: Words representing out of vocabulary items
    bracketed = 5  #: Words that are in brackets
    cutoff = 6  #: Words that are cutoffs of particular words or hesitations of the next word
    laughter = 7  #: Words that represent laughter
    noise = 8  #: Words that represent non-speech noise
    music = 9  #: Words that represent music
    disambiguation = 10  #: Disambiguation symbols internal to Kaldi
    interjection = 11  #: Set of words that can be added on the fly to transcripts

    @classmethod
    def speech_types(cls):
        """Set of the members treated as speech: speech, clitic, and interjection"""
        member_names = ("speech", "clitic", "interjection")
        return {cls[member_name] for member_name in member_names}

    @classmethod
    def non_speech_types(cls):
        """Set of the members treated as non-speech (``disambiguation`` is in neither set)"""
        member_names = (
            "silence",
            "oov",
            "bracketed",
            "cutoff",
            "laughter",
            "noise",
            "music",
        )
        return {cls[member_name] for member_name in member_names}



class DistanceMetric(enum.Enum):
    """Enum for distance metrics; each member's value is its own name as a string"""

    cosine = "cosine"
    plda = "plda"
    euclidean = "euclidean"


class ClusterType(enum.Enum):
    """Enum for supported clustering algorithms; each member's value is its own name as a string"""

    mfa = "mfa"
    affinity = "affinity"
    agglomerative = "agglomerative"
    spectral = "spectral"
    dbscan = "dbscan"
    hdbscan = "hdbscan"
    optics = "optics"
    kmeans = "kmeans"
    meanshift = "meanshift"


class Language(enum.Enum):
    """Enum for supported languages; each member's value is its own name as a string"""

    unknown = "unknown"
    catalan = "catalan"
    chinese = "chinese"
    croatian = "croatian"
    danish = "danish"
    dutch = "dutch"
    english = "english"
    finnish = "finnish"
    french = "french"
    german = "german"
    greek = "greek"
    italian = "italian"
    japanese = "japanese"
    korean = "korean"
    lithuanian = "lithuanian"
    macedonian = "macedonian"
    multilingual = "multilingual"
    norwegian = "norwegian"
    polish = "polish"
    portuguese = "portuguese"
    romanian = "romanian"
    russian = "russian"
    slovenian = "slovenian"
    spanish = "spanish"
    swedish = "swedish"
    thai = "thai"
    ukrainian = "ukrainian"

    def __str__(self) -> str:
        """Name of the language"""
        return self.name


class ManifoldAlgorithm(enum.Enum):
    """Enum for supported manifold visualization algorithms; each member's value is its own name as a string"""

    tsne = "tsne"
    mds = "mds"
    spectral = "spectral"
    isomap = "isomap"



[docs]
class PhoneSetType(enum.Enum):
    """Enum for types of phone sets"""

    UNKNOWN = "UNKNOWN"  #: Unknown
    AUTO = "AUTO"  #: Inspect dictionary to pick the most common phone set type
    IPA = "IPA"  #: IPA-based phoneset
    ARPA = "ARPA"  #: US English-based Arpabet
    PINYIN = "PINYIN"  #: Pinyin for Mandarin

    def __str__(self) -> str:
        """Name of phone set (the member name, e.g. ``IPA``)"""
        return self.name

    @property
    def has_base_phone_regex(self) -> bool:
        """Check for whether a base phone regex is available"""
        return self is PhoneSetType.IPA or self is PhoneSetType.ARPA or self is PhoneSetType.PINYIN

    @property
    def regex_detect(self) -> typing.Optional[re.Pattern]:
        """Pattern for detecting a phone set type"""
        if self is PhoneSetType.ARPA:
            return re.compile(r"[A-Z]{2}[012]")
        elif self is PhoneSetType.PINYIN:
            return re.compile(r"[a-z]{1,3}[12345]")
        elif self is PhoneSetType.IPA:
            return re.compile(
                r"[əɚʊɡɤʁɹɔɛʉɒβɲɟʝŋʃɕʰʲɾ̃̚ː˩˨˧˦˥̪̝̟̥̂̀̄ˑ̊ᵝ̠̹̞̩̯̬̺ˀˤ̻̙̘̰̤̜̑̽᷈᷄᷅̌̋̏‿̆͜͡ˌˈ̣]"
            )
        return None

    @property
    def suprasegmental_phone_regex(self) -> typing.Optional[re.Pattern]:
        """Regex for creating base phones"""
        if self is PhoneSetType.IPA:
            return re.compile(r"([ː̟̥̂̀̄ˑ̊ᵝ̠̹̞̩̯̬̺ˤ̻̙̘̤̜̑̽᷈᷄᷅̌̋̏‿̆͜͡ˌ̍ʱʰʲʷ̚ʼ͈ˈ̣]+)")
        return None

    @property
    def base_phone_regex(self) -> typing.Optional[re.Pattern]:
        """Regex for creating base phones"""
        if self is PhoneSetType.ARPA:
            return re.compile(r"[012]")
        elif self is PhoneSetType.PINYIN:
            return re.compile(r"[12345]")
        elif self is PhoneSetType.IPA:
            return re.compile(r"([ː˩˨˧˦˥̟̥̂̀̄ˑʱʰʲʷ̊ᵝ̠̹̞̩̯̬̺ˀˤ̻̙̘̤̜̑̽᷈᷄᷅̌̋̏‿̆͜͡ˌ̍ˈʼ͈̚]+)")
        return None

    @property
    def voiceless_obstruents(self) -> typing.Set[str]:
        """Voiceless obstruents for the phone set"""
        if self is PhoneSetType.IPA:
            return {
                "p",
                "t",
                "ʈ",
                "k",
                "c",
                "q",
                "f",
                "s",
                "ʂ",
                "s̪",
                "ɕ",
                "x",
                "ç",
                "ɸ",
                "χ",
                "ʃ",
                "h",
                "ʜ",
                "ħ",
                "ʡ",
                "ʔ",
                "θ",
                "ɬ",
                "ɧ",
            }
        elif self is PhoneSetType.ARPA:
            return {"P", "T", "CH", "SH", "S", "F", "TH", "HH", "K"}
        return set()

    @property
    def voiced_obstruents(self) -> typing.Set[str]:
        """Voiced obstruents for the phone set"""
        if self is PhoneSetType.IPA:
            return {
                "b",
                "β",
                "d",
                "g",
                "ɖ",
                "ɡ",
                "ɟ",
                "ɢ",
                "v",
                "z̪",
                "z",
                "ʐ",
                "ʑ",
                "ɣ",
                "ʁ",
                "ʢ",
                "ʕ",
                "ʒ",
                "ʝ",
                "ɦ",
                "ð",
                "ɮ",
            }
        elif self is PhoneSetType.ARPA:
            return {"B", "D", "DH", "JH", "ZH", "Z", "V", "DH", "G"}
        return set()

    @property
    def implosive_obstruents(self) -> typing.Set[str]:
        """Implosive obstruents for the phone set"""
        if self is PhoneSetType.IPA:
            return {"ɓ", "ɗ", "ʄ", "ɠ", "ʛ", "ᶑ", "ɗ̪"}
        return set()

    @property
    def stops(self) -> typing.Set[str]:
        """Stops for the phone set"""
        if self is PhoneSetType.IPA:
            return {
                "p",
                "t",
                "t̪",
                "ʈ",
                "c",
                "k",
                "q",
                "kp",
                "pk",
                "b",
                "d",
                "d̪",
                "ɖ",
                "ɟ",
                "ɡ",
                "ɢ",
                "bɡ",
                "ɡb",
                "ɓ",
                "ɗ",
                "ʄ",
                "ɠ",
                "ʛ",
                "ᶑ",
                "ɗ̪",
                "ʔ",
                "ʡ",
            }
        elif self is PhoneSetType.ARPA:
            return {"B", "D", "P", "T", "G", "K"}
        return set()

    @property
    def sibilants(self) -> typing.Set[str]:
        """Sibilants for the phone set"""
        if self is PhoneSetType.IPA:
            return {"s", "s̪", "ʃ", "ʂ", "ɕ", "z", "z̪", "ʒ", "ʑ", "ʐ", "ɧ"}
        elif self is PhoneSetType.ARPA:
            return {"SH", "S", "ZH", "Z"}
        return set()

    @property
    def affricates(self) -> typing.Set[str]:
        """Affricates for the phone set"""
        if self is PhoneSetType.IPA:
            return {
                "pf",
                "ts",
                "t̪s̪",
                "tʃ",
                "tɕ",
                "tʂ",
                "ʈʂ",
                "cç",
                "kx",
                "tç",
                "dz",
                "d̪z̪",
                "dʒ",
                "dʑ",
                "dʐ",
                "ɖʐ",
                "ɟʝ",
                "ɡɣ",
                "dʝ",
            }
        elif self is PhoneSetType.ARPA:
            return {"JH", "CH"}
        return set()

    @property
    def fricatives(self) -> typing.Set[str]:
        """Fricatives for the phone set"""
        if self is PhoneSetType.IPA:
            return {
                "f",
                "v",
                "ç",
                "ʝ",
                "ħ",
                "ɧ",
                "θ",
                "ð",
                "ʁ",
                "ʢ",
                "ʕ",
                "χ",
                "ʜ",
                "ʢ",
                "ɦ",
                "h",
                "ɸ",
            }
        elif self is PhoneSetType.ARPA:
            return {
                "V",
                "DH",
                "HH",
                "F",
                "TH",
            }
        return set()

    @property
    def laterals(self) -> typing.Set[str]:
        """Laterals for the phone set"""
        if self is PhoneSetType.IPA:
            return {"l", "ɫ", "ʟ", "ʎ", "l̪"}
        elif self is PhoneSetType.ARPA:
            return {"L"}
        return set()

    @property
    def nasals(self) -> typing.Set[str]:
        """Nasals for the phone set"""
        if self is PhoneSetType.IPA:
            return {"ɲ", "ŋ", "m", "n", "ɳ", "ɴ", "ɱ", "ŋm", "n̪"}
        elif self is PhoneSetType.ARPA:
            return {"M", "N", "NG"}
        return set()

    @property
    def trills(self) -> typing.Set[str]:
        """Trills for the phone set"""
        if self is PhoneSetType.IPA:
            return {"ʙ", "r", "ʀ", "r̝"}
        elif self is PhoneSetType.ARPA:
            return set()
        return set()

    @property
    def taps(self) -> typing.Set[str]:
        """Taps for the phone set"""
        if self is PhoneSetType.IPA:
            return {"ɾ", "ɽ", "ⱱ"}
        elif self is PhoneSetType.ARPA:
            return set()
        return set()

    @property
    def lateral_taps(self) -> typing.Set[str]:
        """Lateral taps for the phone set"""
        if self is PhoneSetType.IPA:
            return {"ɭ", "ɺ"}
        elif self is PhoneSetType.ARPA:
            return set()
        return set()

    @property
    def lateral_fricatives(self) -> typing.Set[str]:
        """Lateral fricatives for the phone set"""
        if self is PhoneSetType.IPA:
            return {"ɬ", "ɮ"}
        elif self is PhoneSetType.ARPA:
            return set()
        return set()

    @property
    def approximants(self) -> typing.Set[str]:
        """Approximants for the phone set"""
        if self is PhoneSetType.IPA:
            return {"ɹ", "ɻ", "ʋ", "ʍ"} | self.glides
        elif self is PhoneSetType.ARPA:
            return {"R"} | self.glides
        return set()

    @property
    def glides(self) -> typing.Set[str]:
        """Glides for the phone set"""
        if self is PhoneSetType.IPA:
            return {"j", "w", "w̃", "j̃", "ɥ", "ɰ", "ɥ̃", "ɰ̃", "j̰"}
        elif self is PhoneSetType.ARPA:
            return {"Y", "W"}
        return set()

    @property
    def nasal_approximants(self) -> typing.Set[str]:
        """Nasal approximants for the phone set"""
        if self is PhoneSetType.IPA:
            return {"w̃", "j̃", "ɥ̃", "ɰ̃"}
        elif self is PhoneSetType.ARPA:
            return set()
        return set()

    @property
    def labials(self) -> typing.Set[str]:
        """Labials for the phone set"""
        if self is PhoneSetType.IPA:
            return {"b", "p", "m", "ɸ", "β", "ɓ", "w", "ʍ"}
        elif self is PhoneSetType.ARPA:
            return {"B", "P", "M", "W"}
        return set()

    @property
    def labiodental(self) -> typing.Set[str]:
        """Labiodentals for the phone set"""
        if self is PhoneSetType.IPA:
            return {"f", "v", "ʋ", "ⱱ", "ɱ", "pf"}
        elif self is PhoneSetType.ARPA:
            return {"F", "V"}
        return set()

    @property
    def dental(self) -> typing.Set[str]:
        """Dentals for the phone set"""
        if self is PhoneSetType.IPA:
            return {"ð", "θ", "t̪", "d̪", "s̪", "z̪", "t̪s̪", "d̪z̪", "n̪", "l̪", "ɗ̪"}
        elif self is PhoneSetType.ARPA:
            return {"DH", "TH"}
        return set()

    @property
    def alveolar(self) -> typing.Set[str]:
        """Alveolars for the phone set"""
        if self is PhoneSetType.IPA:
            return {
                "t",
                "d",
                "s",
                "z",
                "n",
                "r",
                "l",
                "ɹ",
                "ɾ",
                "ɬ",
                "ɮ",
                "ɫ",
                "ts",
                "dz",
                "ɗ",
                "ɺ",
            }
        elif self is PhoneSetType.ARPA:
            return {"T", "D", "S", "Z", "N", "R", "L"}
        return set()

    @property
    def retroflex(self) -> typing.Set[str]:
        """Retroflexes for the phone set"""
        if self is PhoneSetType.IPA:
            return {"ʈ", "ʂ", "ʐ", "ɖ", "ɽ", "ɻ", "ɭ", "ɳ", "ʈʂ", "ɖʐ", "ᶑ"}
        elif self is PhoneSetType.ARPA:
            return set()
        return set()

    @property
    def alveopalatal(self) -> typing.Set[str]:
        """Alveopalatals for the phone set"""
        if self is PhoneSetType.IPA:
            return {"ʒ", "ʃ", "dʒ", "tʃ"}
        elif self is PhoneSetType.ARPA:
            return {"ZH", "SH", "JH", "CH"}
        return set()

    @property
    def palatalized(self) -> typing.Set[str]:
        """Palatalized phones for the phone set"""
        if self is PhoneSetType.IPA:
            palatals = set()
            palatals.update(x + "ʲ" for x in self.labials)
            palatals.update(x + "ʲ" for x in self.labiodental)
            palatals.update(x + "ʲ" for x in self.dental)
            palatals.update(x + "ʲ" for x in self.alveolar)
            palatals.update(x + "ʲ" for x in self.retroflex)
            palatals.update(x + "ʲ" for x in self.palatal)
            palatals.update(x + "ʲ" for x in self.velar)
            palatals.update(x + "ʲ" for x in self.uvular)
            palatals.update(x + "ʲ" for x in self.pharyngeal)
            palatals.update(x + "ʲ" for x in self.epiglottal)
            palatals.update(x + "ʲ" for x in self.glottal)
            return palatals
        elif self is PhoneSetType.ARPA:
            return set()
        return set()

    @property
    def labialized(self) -> typing.Set[str]:
        """Labialized phones for the phone set"""
        if self is PhoneSetType.IPA:
            palatals = set()
            palatals.update(x + "ʷ" for x in self.labials)
            palatals.update(x + "ʷ" for x in self.labiodental)
            palatals.update(x + "ʷ" for x in self.dental)
            palatals.update(x + "ʷ" for x in self.alveolar)
            palatals.update(x + "ʷ" for x in self.retroflex)
            palatals.update(x + "ʷ" for x in self.palatal)
            palatals.update(x + "ʷ" for x in self.velar)
            palatals.update(x + "ʷ" for x in self.uvular)
            palatals.update(x + "ʷ" for x in self.pharyngeal)
            palatals.update(x + "ʷ" for x in self.epiglottal)
            palatals.update(x + "ʷ" for x in self.glottal)
            return palatals
        elif self is PhoneSetType.ARPA:
            return set()
        return set()

    @property
    def palatal(self) -> typing.Set[str]:
        """Palatal phones for the phone set"""
        if self is PhoneSetType.IPA:
            return {"ç", "c", "ɕ", "tɕ", "ɟ", "ɟʝ", "ʝ", "ɲ", "ɥ", "j", "ʎ", "ʑ", "dʑ"}
        elif self is PhoneSetType.ARPA:
            return {"Y"}
        return set()

    @property
    def velar(self) -> typing.Set[str]:
        """Velar phones for the phone set"""
        if self is PhoneSetType.IPA:
            return {"k", "x", "ɡ", "ɠ", "ɣ", "ɰ", "ŋ"}
        elif self is PhoneSetType.ARPA:
            return {"K", "NG", "G"}
        return set()

    @property
    def uvular(self) -> typing.Set[str]:
        """Uvular phones for the phone set"""
        if self is PhoneSetType.IPA:
            return {"q", "ɢ", "ʛ", "χ", "ʀ", "ʁ", "ʟ", "ɴ"}
        elif self is PhoneSetType.ARPA:
            return set()
        return set()

    @property
    def pharyngeal(self) -> typing.Set[str]:
        """Pharyngeal phones for the phone set"""
        if self is PhoneSetType.IPA:
            return {"ʕ", "ħ"}
        elif self is PhoneSetType.ARPA:
            return set()
        return set()

    @property
    def epiglottal(self) -> typing.Set[str]:
        """Epiglottal phones for the phone set"""
        if self is PhoneSetType.IPA:
            return {"ʡ", "ʢ", "ʜ"}
        elif self is PhoneSetType.ARPA:
            return set()
        return set()

    @property
    def glottal(self) -> typing.Set[str]:
        """Glottal phones for the phone set"""
        if self is PhoneSetType.IPA:
            return {"ʔ", "ɦ", "h"}
        elif self is PhoneSetType.ARPA:
            return {"HH"}
        return set()

    @property
    def close_vowels(self) -> typing.Set[str]:
        """Close vowels for the phone set"""
        if self is PhoneSetType.IPA:
            return {"ɪ", "ɨ", "ɪ̈", "ʉ", "ʊ", "i", "ĩ", "ɯ", "y", "u", "ʏ", "ũ"}
        elif self is PhoneSetType.ARPA:
            return {"IH", "UH", "IY", "UW"}
        return set()

    @property
    def close_mid_vowels(self) -> typing.Set[str]:
        """Close-mid vowels for the phone set"""
        if self is PhoneSetType.IPA:
            return {"e", "ẽ", "ej", "eɪ", "o", "õ", "ow", "oʊ", "ɤ", "ø", "ɵ", "ɘ", "ə", "ɚ", "ʏ̈"}
        elif self is PhoneSetType.ARPA:
            return {"EY", "OW", "AH"}
        return set()

    @property
    def open_mid_vowels(self) -> typing.Set[str]:
        """Open-mid vowels for the phone set"""
        if self is PhoneSetType.IPA:
            return {"ɛ", "ɜ", "ɞ", "œ", "ɔ", "ʌ", "ɐ", "æ", "ɛ̈", "ɔ̈", "ɝ"}
        elif self is PhoneSetType.ARPA:
            return {"EH", "AE", "ER"}
        return set()

    @property
    def open_vowels(self) -> typing.Set[str]:
        """Open vowels for the phone set"""
        if self is PhoneSetType.IPA:
            return {"a", "ã", "ɶ", "ɒ", "ɑ"}
        elif self is PhoneSetType.ARPA:
            return {"AO", "AA"}
        return set()

    @property
    def front_vowels(self) -> typing.Set[str]:
        """Front vowels for the phone set"""
        if self is PhoneSetType.IPA:
            return {
                "i",
                "ĩ",
                "y",
                "ɪ",
                "ʏ",
                "e",
                "ẽ",
                "ɪ",
                "ʏ",
                "ɛ̈",
                "ʏ̈",
                "ej",
                "eɪ",
                "ø",
                "ɛ",
                "œ",
                "æ",
                "ɶ",
            }
        elif self is PhoneSetType.ARPA:
            return {"IY", "EY", "EH", "AE", "IH"}
        return set()

    @property
    def central_vowels(self) -> typing.Set[str]:
        """
        Central vowels for the phone set

        Returns
        -------
        set[str]
            Central vowel symbols for IPA or ARPA, an empty set for any other phone set type
        """
        if self is PhoneSetType.IPA:
            phones = {
                "ɨ", "ʉ", "ɘ", "ɵ", "ə", "ɜ",
                "ɞ", "ɐ", "ɚ", "ã", "a", "ɝ",
            }
        elif self is PhoneSetType.ARPA:
            phones = {"UW", "AH", "ER"}
        else:
            phones = set()
        return phones

    @property
    def back_vowels(self) -> typing.Set[str]:
        """
        Back vowels for the phone set

        Returns
        -------
        set[str]
            Back vowel symbols for IPA or ARPA, an empty set for any other phone set type
        """
        if self is PhoneSetType.IPA:
            phones = {
                "ɯ", "u", "ũ", "ʊ", "ɔ̈", "ɤ", "o",
                "õ", "ow", "oʊ", "ʌ", "ɔ", "ɑ", "ɒ",
            }
        elif self is PhoneSetType.ARPA:
            phones = {"OW", "AO", "AA", "UH"}
        else:
            phones = set()
        return phones

    @property
    def rounded_vowels(self) -> typing.Set[str]:
        """
        Rounded vowels for the phone set

        Returns
        -------
        set[str]
            Rounded vowel symbols for IPA or ARPA, an empty set for any other phone set type
        """
        if self is PhoneSetType.IPA:
            phones = {
                "y", "ʏ", "o", "õ", "u", "ʊ", "ow", "oʊ", "ɔ", "ø",
                "ɵ", "ɞ", "œ", "ɒ", "ɶ", "ʉ", "ʏ̈", "ɔ̈", "ũ",
            }
        elif self is PhoneSetType.ARPA:
            phones = {"OW", "UW", "UH", "AO"}
        else:
            phones = set()
        return phones

    @property
    def unrounded_vowels(self) -> typing.Set[str]:
        """
        Unrounded vowels for the phone set

        Returns
        -------
        set[str]
            Unrounded vowel symbols for IPA or ARPA, an empty set for any other phone set type
        """
        if self is PhoneSetType.IPA:
            phones = {
                "i", "ĩ", "e", "ɛ̈", "ej", "ẽ", "ɤ", "eɪ",
                "ɨ", "ɯ", "ɘ", "ə", "ɚ", "ɪ", "ɪ̈", "ɛ",
                "ɜ", "ɝ", "ʌ", "ɐ", "ɑ", "æ", "ã", "a",
            }
        elif self is PhoneSetType.ARPA:
            phones = {"IY", "EY", "EH", "AH", "IH", "ER", "AE", "AA"}
        else:
            phones = set()
        return phones

    @property
    def diphthong_phones(self) -> typing.Set[str]:
        """
        Diphthong phones for the phone set type (these will have 5 states in HMM topologies)

        Returns
        -------
        set[str]
            For ARPA, the AY/AW/OY phones bare and with stress digits 0-2; for IPA and PINYIN,
            every vowel + vowel concatenation (IPA additionally includes glide + vowel and
            vowel + glide concatenations); an empty set for any other phone set type
        """
        if self is PhoneSetType.ARPA:
            return {
                f"{base}{stress}"
                for base in ("AY", "AW", "OY")
                for stress in ("", "0", "1", "2")
            }
        if self is not PhoneSetType.IPA and self is not PhoneSetType.PINYIN:
            return set()

        vowel_set = self.vowels
        pairs = {first + second for first in vowel_set for second in vowel_set}
        if self is PhoneSetType.IPA:
            glide_set = self.glides
            pairs.update(glide + vowel for glide in glide_set for vowel in vowel_set)
            pairs.update(vowel + glide for vowel in vowel_set for glide in glide_set)
        return pairs

    @property
    def vowels(self) -> typing.Set[str]:
        """
        Vowels for the phone set type

        Returns
        -------
        set[str]
            Base vowel symbols for PINYIN, ARPA or IPA, an empty set for any other phone set type
        """
        if self is PhoneSetType.PINYIN:
            return {"i", "u", "y", "e", "w", "a", "o", "ü"}
        if self is PhoneSetType.ARPA:
            return {"IH", "UH", "IY", "AE", "UW", "AH", "AO", "AA"}
        if self is PhoneSetType.IPA:
            # NOTE: an earlier revision of this branch also built a nasalized copy of these
            # symbols but never returned it; the returned set has only ever been the
            # plain symbols below (which include the glide symbols "j" and "w").
            return {
                "i",
                "u",
                "e",
                "ə",
                "a",
                "o",
                "y",
                "ɔ",
                "j",
                "w",
                "ɪ",
                "ʊ",
                "ʏ",
                "ɯ",
                "ɤ",
                "ɑ",
                "æ",
                "ɐ",
                "ɚ",
                "ɵ",
                "ɘ",
                "ɛ",
                "ɜ",
                "ɝ",
                "ɞ",
                "ɨ",
                "ɪ̈",
                "œ",
                "ɒ",
                "ɶ",
                "ø",
                "ʉ",
                "ʌ",
            }
        return set()

    @property
    def triphthong_phones(self) -> typing.Set[str]:
        """
        Triphthong phones for the phone set type

        Returns
        -------
        set[str]
            For IPA and PINYIN, every vowel + vowel + vowel concatenation (IPA additionally
            includes glide + diphthong and diphthong + glide concatenations); an empty set for
            any other phone set type
        """
        if self is not PhoneSetType.IPA and self is not PhoneSetType.PINYIN:
            return set()

        vowel_set = self.vowels
        triples = {"".join(combo) for combo in itertools.product(vowel_set, repeat=3)}
        if self is PhoneSetType.IPA:
            glide_set = self.glides
            diphthong_set = self.diphthong_phones
            triples.update(
                glide + diphthong for glide in glide_set for diphthong in diphthong_set
            )
            triples.update(
                diphthong + glide for diphthong in diphthong_set for glide in glide_set
            )
        return triples

    @property
    def extra_questions(self) -> typing.Dict[str, typing.Set[str]]:
        """
        Extra questions for phone clustering in triphone models

        Returns
        -------
        dict[str, set[str]]
            Mapping of question name to the set of phones it groups together; empty for phone
            set types other than ARPA, PINYIN and IPA
        """
        questions: typing.Dict[str, typing.Set[str]] = {}

        if self is PhoneSetType.ARPA:
            # Manner/place classes first, then vowel features, in a fixed key order
            questions.update(
                {
                    "stops": self.stops,
                    "fricatives": self.fricatives,
                    "sibilants": self.sibilants | self.affricates,
                    "approximants": self.approximants,
                    "laterals": self.laterals,
                    "nasals": self.nasals,
                    "labials": self.labials | self.labiodental,
                    "dental": self.dental | self.labiodental,
                    "coronal": self.dental | self.alveolar | self.alveopalatal,
                    "dorsal": self.velar | self.glottal,
                    "unrounded": self.unrounded_vowels,
                    "rounded": self.rounded_vowels,
                    "front": self.front_vowels,
                    "central": self.central_vowels,
                    "back": self.back_vowels,
                    "close": self.close_vowels,
                    "close_mid": self.close_mid_vowels,
                    "open_mid": self.open_mid_vowels,
                    "open": self.open_vowels,
                }
            )

            # One question per stress digit, covering every ARPA vowel with that digit
            stress_bases = (
                "AA", "AE", "AH", "AO", "AW",
                "AY", "EH", "ER", "EY", "IH",
                "IY", "OW", "OY", "UH", "UW",
            )
            for stress in range(3):
                questions[f"stress_{stress}"] = {f"{base}{stress}" for base in stress_bases}

        elif self is PhoneSetType.PINYIN:
            # One question per tone number, covering vowels, diphthongs and triphthongs
            tone_bearing = self.vowels | self.diphthong_phones | self.triphthong_phones
            for tone in range(1, 6):
                questions[f"tone_{tone}"] = {f"{phone}{tone}" for phone in tone_bearing}
            questions.update(
                {
                    "bilabial_variation": {"p", "b"},
                    "nasal_variation": {"m", "n", "ng"},
                    "voiceless_sibilant_variation": {
                        "z", "zh", "j", "c", "ch", "q", "s", "sh", "x",
                    },
                    "dorsal_variation": {"h", "k", "g"},
                    "alveolar_stop_variation": {"t", "d"},
                    "approximant_variation": {"l", "r", "y", "w"},
                    "rhotic_variation": {"r", "sh", "e"},
                }
            )

        elif self is PhoneSetType.IPA:

            def expand_variants(phone_set):
                """Union of the voiceless/voiced variant sets of every phone in ``phone_set``"""
                expanded = set()
                for phone in phone_set:
                    variants_for = (
                        voiceless_variants
                        if phone in self.voiceless_obstruents
                        else voiced_variants
                    )
                    expanded |= variants_for(phone)
                return expanded

            # (question name, base phones) pairs whose phones are expanded with variants
            variant_sources = [
                ("stops", self.stops),
                ("fricatives", self.fricatives | self.lateral_fricatives),
                ("sibilants", self.sibilants | self.affricates),
                ("approximants", self.approximants),
                ("laterals", self.laterals),
                ("nasals", self.nasals | self.nasal_approximants),
                ("trills", self.trills | self.taps),
                ("labials", self.labials | self.labiodental | self.labialized),
                ("dental", self.dental | self.labiodental),
                ("coronal", self.dental | self.alveolar | self.retroflex | self.alveopalatal),
                ("dorsal", self.palatal | self.velar | self.uvular),
                ("palatals", self.palatal | self.alveopalatal | self.palatalized),
                ("pharyngeal", self.pharyngeal | self.epiglottal | self.glottal),
                ("unrounded", self.unrounded_vowels),
                ("rounded", self.rounded_vowels),
                ("front", self.front_vowels),
                ("central", self.central_vowels),
                ("back", self.back_vowels),
                ("close", self.close_vowels),
                ("close_mid", self.close_mid_vowels),
                ("open_mid", self.open_mid_vowels),
                ("open", self.open_vowels),
                ("front_semi_vowels", {"j", "i", "ɪ", "ɥ", "ʏ", "y"}),
                ("back_semi_vowels", {"w", "u", "ʊ", "ɰ", "ɯ", "ʍ"}),
            ]
            for name, base_phones in variant_sources:
                questions[name] = expand_variants(base_phones)

            # Some language specific questions
            questions["L_vocalization"] = {"ʊ", "ɫ", "u", "ʉ"}
            questions["ts_z_variation"] = {"ts", "z"}
            questions["rhotics"] = {"ɹ", "ɝ", "ɚ", "ə", "ʁ", "ɐ"}
            questions["diphthongs"] = self.diphthong_phones
            questions["triphthongs"] = self.triphthong_phones

        return questions



# noinspection PyUnresolvedReferences
@dataclassy.dataclass(slots=True)
class SoundFileInformation:
    """
    Data class for sound file information with format, sample rate, duration, number of
    channels, and sox_string for use in Kaldi feature extraction if necessary

    Parameters
    ----------
    format: str
        Format of the sound file
    sample_rate: int
        Sample rate
    duration: float
        Duration
    num_channels: int
        Number of channels
    sox_string: str
        String to use for loading with sox
    """

    format: str
    sample_rate: int
    duration: float
    num_channels: int
    sox_string: str

    @property
    def meta(self) -> typing.Dict[str, typing.Any]:
        """Dictionary representation of sound file information, built with ``dataclassy.asdict``"""
        return dataclassy.asdict(self)



# noinspection PyUnresolvedReferences
@dataclassy.dataclass(slots=True)
class FileExtensions:
    """
    Data class for information about the current directory

    Parameters
    ----------
    identifiers: set[str]
        Set of identifiers
    lab_files: dict[str, str]
        Mapping of identifiers to lab files
    textgrid_files: dict[str, str]
        Mapping of identifiers to TextGrid files
    wav_files: dict[str, str]
        Mapping of identifiers to wav files
    other_audio_files: dict[str, str]
        Mapping of identifiers to other audio files
    """

    identifiers: typing.Set[str]
    lab_files: typing.Dict[str, str]
    textgrid_files: typing.Dict[str, str]
    wav_files: typing.Dict[str, str]
    other_audio_files: typing.Dict[str, str]


# noinspection PyUnresolvedReferences
@dataclassy.dataclass(slots=True)
class WordData:
    """
    Data class for information about a word and its pronunciations

    Parameters
    ----------
    orthography: str
        Orthographic string for the word
    pronunciations: set[tuple[str, ...]]
        Set of tuple pronunciations for the word
    """

    orthography: str
    pronunciations: typing.Set[typing.Tuple[str, ...]]



# noinspection PyUnresolvedReferences
@dataclassy.dataclass(slots=True)
class NgramHistoryState:
    """
    Data class for storing ngram history

    Parameters
    ----------
    backoff_prob: float
        Backoff probability for the history, defaults to 1.0
    word_to_prob: dict
        Mapping of word to its probability following the history, defaults to an empty dict
    """

    backoff_prob: float = 1.0
    # NOTE(review): mutable ``{}`` default -- dataclassy is understood to copy mutable
    # defaults per instance (unlike stdlib dataclasses); confirm against the dataclassy docs
    word_to_prob: dict = {}


class ArpaNgramModel:
    """
    Wrapper class for ngram models, taken largely from :kaldi_utils:`lang/internal/arpa2fst_constrained.py`

    Attributes
    ----------
    orders: dict[int, collections.defaultdict]
        Mapping of history length to a mapping of history tuple to
        :class:`NgramHistoryState`; ``orders[n]`` holds the states whose history is ``n`` words long
    """

    def __init__(self):
        self.orders = {0: collections.defaultdict(NgramHistoryState)}

    @classmethod
    def read(cls, input: typing.Union[io.StringIO, str]):
        """
        Read an ngram model from a stream

        Parameters
        ----------
        input: :class:`io.StringIO` or str
            Input stream or file path to read

        Returns
        -------
        :class:`~montreal_forced_aligner.data.ArpaNgramModel`
            Constructed model
        """
        # Only close the stream if it was opened here; caller-supplied streams stay open
        opened_here = isinstance(input, str)
        stream = open(input, "r", encoding="utf8") if opened_here else input
        section_header = re.compile(r"\\(?P<order>[0-9]*)-grams:$")
        log10 = math.log(10.0)
        current_order = -1
        model = cls()
        try:
            for line in stream:
                line = line.strip()
                if not line:
                    continue
                m = section_header.match(line)
                if m:
                    # "\N-grams:" starts the section listing N-grams
                    current_order = int(m.group("order"))
                    model.orders[current_order] = collections.defaultdict(NgramHistoryState)
                    continue
                if current_order < 1:
                    # Still in the header preceding the first n-gram section
                    continue
                if line.startswith("\\"):
                    # Any other backslash-prefixed marker line is ignored
                    continue
                col = line.split()
                # Columns: log10 probability, the n-gram's words, optional log10 backoff weight
                prob = math.exp(float(col[0]) * log10)
                hist = tuple(col[1:current_order])
                word = col[current_order]  # a string
                backoff_prob = (
                    math.exp(float(col[current_order + 1]) * log10)
                    if len(col) == current_order + 2
                    else None
                )

                # The probability is stored on the state for the (order - 1)-word history ...
                model.orders[current_order - 1][hist].word_to_prob[word] = prob
                if backoff_prob is not None:
                    # ... and the backoff weight on the state for the full n-gram as a history
                    model.orders[current_order][hist + (word,)].backoff_prob = backoff_prob
        finally:
            if opened_here:
                stream.close()
        return model

    def history_to_fst_state_mapping(
        self, min_order: typing.Optional[int] = None, max_order: typing.Optional[int] = None
    ) -> typing.Tuple[
        typing.Dict[typing.Tuple[str, ...], int], typing.List[typing.Tuple[str, ...]]
    ]:
        """

        This function, called from PrintAsFst, returns (hist_to_state,
        state_to_hist), which map from history (as a tuple of strings) to
        integer FST-state and vice versa.

        Parameters
        ----------
        min_order: int, optional
            Minimum order of ngrams to construct state mapping
        max_order: int, optional
            Maximum order of ngrams to construct state mapping

        Returns
        -------
        typing.Dict[typing.Tuple[str, ...], int]
            History to state mapping
        typing.List[typing.Tuple[str, ...]]
            State to history mapping
        """

        hist_to_state = {}
        state_to_hist = []

        # Make sure the initial bigram state comes first (and that
        # we have such a state even if it was completely pruned
        # away in the bigram LM.. which is unlikely of course)
        hist = ("<s>",)
        hist_to_state[hist] = len(state_to_hist)
        state_to_hist.append(hist)

        # create a bigram state for each of the 'real' words...  even if the LM
        # didn't naturally have such bigram states, we'll create them so that we
        # can enforce the bigram constraints supplied in 'bigrams_file' by the
        # user.
        for word in self.orders[0][()].word_to_prob:
            if word != "<s>" and word != "</s>":
                hist = (word,)
                hist_to_state[hist] = len(state_to_hist)
                state_to_hist.append(hist)

        # note: we do not allocate an FST state for the unigram state, because
        # we don't have a unigram state in the output FST, only bigram states; and
        # we don't iterate over bigram histories because we covered them all above;
        # that's why the callers in this class pass min_order=2.
        for order, history_states in self.orders.items():
            if min_order is not None and order < min_order:
                continue
            if max_order is not None and order > max_order:
                continue
            for hist in history_states.keys():
                # note: hist is a tuple of strings.
                assert hist not in hist_to_state
                hist_to_state[hist] = len(state_to_hist)
                state_to_hist.append(hist)

        return (hist_to_state, state_to_hist)

    def _get_prob(self, hist: typing.Tuple[str, ...], word: str) -> float:
        """
        Returns the probability of word 'word' in history-state 'hist'.
        If the history-state does not exist, or does not predict the word, this backs off
        to the history with its oldest word dropped (applying the state's backoff
        probability when the state exists). Raises a KeyError if the word is not predicted
        at all by the LM (not in the unigram vocabulary).

        Parameters
        ----------
        hist: tuple[str,...]
            History for ngram
        word: str
            Current word

        Returns
        -------
        float
            Probability
        """
        assert len(hist) < len(self.orders)
        if len(hist) == 0:
            word_to_prob = self.orders[0][()].word_to_prob
            return word_to_prob[word]
        else:
            if hist in self.orders[len(hist)]:
                hist_state = self.orders[len(hist)][hist]
                if word in hist_state.word_to_prob:
                    return hist_state.word_to_prob[word]
                else:
                    return hist_state.backoff_prob * self._get_prob(hist[1:], word)
            else:
                return self._get_prob(hist[1:], word)

    def _get_state_for_hist(self, hist_to_state, hist) -> int:
        """
        This gets the state corresponding to 'hist' in 'hist_to_state', but backs
        off for us if there is no such state.

        Parameters
        ----------
        hist_to_state: dict[tuple[str, ...], int]
            Mapping of history to states
        hist: tuple[str, ...]
            History to look up

        Returns
        -------
        int
            State for history
        """
        if hist in hist_to_state:
            return hist_to_state[hist]
        else:
            # A single-word history with no state cannot be backed off any further
            assert len(hist) > 1
            return self._get_state_for_hist(hist_to_state, hist[1:])

    def construct_bigram_fst(
        self,
        disambig_symbol: str,
        bigram_map: typing.Dict[str, typing.Set[str]],
        symbols: pywrapfst.SymbolTable,
    ) -> pynini.Fst:
        """

        This function constructs the estimated language model as an FST.
        disambig_symbol will be something like '#0' (a symbol introduced
        to make the result determinizable).
        bigram_map represent the allowed bigrams (left-word, right-word): it's a map
        from left-word to a set of right-words (both are strings).

        Parameters
        ----------
        disambig_symbol: str
            Disambiguation symbol
        bigram_map: dict[str, set[str]]
            Mapping of left bigrams to allowed right bigrams
        symbols: :class:`pywrapfst.SymbolTable`
            Symbol table for the FST

        Returns
        -------
        :class:`pynini.Fst`
            Bigram FST
        """

        # History will map from history (as a tuple) to integer FST-state.
        (hist_to_state, state_to_hist) = self.history_to_fst_state_mapping(min_order=2)

        # The following 3 things are just for diagnostics (they are accumulated but not reported).
        normalization_stats = [[0, 0.0] for _ in range(len(self.orders))]
        num_ngrams_allowed = 0
        num_ngrams_disallowed = 0

        fst = pynini.Fst()
        for state in range(len(state_to_hist)):
            s = fst.add_state()
            hist = state_to_hist[state]
            hist_len = len(hist)
            assert hist_len > 0
            if hist_len == 1:  # it's a bigram state...
                context_word = hist[0]
                if context_word not in bigram_map:
                    # No allowed continuations: the state exists but gets no arcs
                    continue
                # word list is a list of words that can follow this word.  It must be nonempty.
                word_list = list(bigram_map[context_word])

                normalization_stats[hist_len][0] += 1

                for word in word_list:
                    prob = self._get_prob((context_word,), word)
                    assert prob != 0
                    normalization_stats[hist_len][1] += prob
                    cost = -math.log(prob)
                    if word == "</s>":
                        # End of sentence is encoded as a final weight rather than an arc
                        fst.set_final(s, pywrapfst.Weight(fst.weight_type(), cost))
                    else:
                        next_state = self._get_state_for_hist(hist_to_state, (context_word, word))
                        k = symbols.find(word)
                        fst.add_arc(state, pywrapfst.Arc(k, k, cost, next_state))
            else:  # it's a higher-order than bigram state.
                assert hist in self.orders[hist_len]
                hist_state = self.orders[hist_len][hist]
                most_recent_word = hist[-1]

                normalization_stats[hist_len][0] += 1
                normalization_stats[hist_len][1] += sum(
                    self._get_prob(hist, word) for word in bigram_map[most_recent_word]
                )

                for word, prob in hist_state.word_to_prob.items():
                    cost = -math.log(prob)
                    if word in bigram_map[most_recent_word]:
                        num_ngrams_allowed += 1
                    else:
                        # Skip n-grams whose final bigram is not allowed
                        num_ngrams_disallowed += 1
                        continue
                    if word == "</s>":
                        fst.set_final(s, pywrapfst.Weight(fst.weight_type(), cost))
                    else:
                        next_state = self._get_state_for_hist(hist_to_state, (hist) + (word,))
                        k = symbols.find(word)
                        fst.add_arc(state, pywrapfst.Arc(k, k, cost, next_state))

                assert hist in self.orders[hist_len]
                backoff_prob = self.orders[hist_len][hist].backoff_prob
                assert backoff_prob != 0.0
                cost = -math.log(backoff_prob)
                backoff_hist = hist[1:]
                backoff_state = self._get_state_for_hist(hist_to_state, backoff_hist)

                # The backoff arc carries the disambiguation symbol only when this state
                # also has word arcs; otherwise it is a plain epsilon arc
                this_disambig_symbol = (
                    disambig_symbol if len(hist_state.word_to_prob) != 0 else "<eps>"
                )
                k = symbols.find(this_disambig_symbol)
                eps = symbols.find("<eps>")
                fst.add_arc(state, pywrapfst.Arc(k, eps, cost, backoff_state))
        fst.set_start(0)
        return fst

    def export_bigram_fst(
        self,
        output: typing.Union[str, io.StringIO],
        disambig_symbol: str,
        bigram_map: typing.Dict[str, typing.Set[str]],
    ) -> None:
        """

        This function prints the estimated language model as an FST, one arc or final
        state per line.
        disambig_symbol will be something like '#0' (a symbol introduced
        to make the result determinizable).
        bigram_map represent the allowed bigrams (left-word, right-word): it's a map
        from left-word to a set of right-words (both are strings).

        Parameters
        ----------
        output: :class:`io.StringIO` or str
            Output stream or file name to export to; a file opened from a file name is
            closed before returning, while a stream passed in by the caller is left open
        disambig_symbol: str
            Disambiguation symbol to use
        bigram_map: dict[str, set[str]]
            Mapping of left bigrams to allowed right bigrams

        """

        # History will map from history (as a tuple) to integer FST-state.
        (hist_to_state, state_to_hist) = self.history_to_fst_state_mapping(min_order=2)

        # The following 3 things are just for diagnostics (they are accumulated but not reported).
        normalization_stats = [[0, 0.0] for _ in range(len(self.orders))]
        num_ngrams_allowed = 0
        num_ngrams_disallowed = 0

        # Only close what was opened here, so a caller's StringIO keeps its contents
        opened_here = isinstance(output, str)
        stream = open(output, "w", encoding="utf8") if opened_here else output
        try:
            for state, hist in enumerate(state_to_hist):
                hist_len = len(hist)
                assert hist_len > 0
                if hist_len == 1:  # it's a bigram state...
                    context_word = hist[0]
                    if context_word not in bigram_map:
                        continue
                    # word list is a list of words that can follow this word.
                    # It must be nonempty.
                    word_list = list(bigram_map[context_word])

                    normalization_stats[hist_len][0] += 1

                    for word in word_list:
                        prob = self._get_prob((context_word,), word)
                        assert prob != 0
                        normalization_stats[hist_len][1] += prob
                        cost = -math.log(prob)
                        if word == "</s>":
                            # Final-state line: "<state> <cost>"
                            stream.write(f"{state} {cost:.3f}\n")
                        else:
                            next_state = self._get_state_for_hist(
                                hist_to_state, (context_word, word)
                            )
                            stream.write(f"{state} {next_state} {word} {word} {cost:.3f}\n")
                else:  # it's a higher-order than bigram state.
                    assert hist in self.orders[hist_len]
                    hist_state = self.orders[hist_len][hist]
                    most_recent_word = hist[-1]

                    normalization_stats[hist_len][0] += 1
                    normalization_stats[hist_len][1] += sum(
                        self._get_prob(hist, word) for word in bigram_map[most_recent_word]
                    )

                    for word, prob in hist_state.word_to_prob.items():
                        cost = -math.log(prob)
                        if word in bigram_map[most_recent_word]:
                            num_ngrams_allowed += 1
                        else:
                            num_ngrams_disallowed += 1
                            continue
                        if word == "</s>":
                            stream.write(f"{state} {cost:.3f}\n")
                        else:
                            next_state = self._get_state_for_hist(
                                hist_to_state, (hist) + (word,)
                            )
                            stream.write(f"{state} {next_state} {word} {word} {cost:.3f}\n")

                    assert hist in self.orders[hist_len]
                    backoff_prob = self.orders[hist_len][hist].backoff_prob
                    assert backoff_prob != 0.0
                    cost = -math.log(backoff_prob)
                    backoff_hist = hist[1:]
                    backoff_state = self._get_state_for_hist(hist_to_state, backoff_hist)

                    this_disambig_symbol = (
                        disambig_symbol if len(hist_state.word_to_prob) != 0 else "<eps>"
                    )
                    # Each arc must end its own line, otherwise the next state's first
                    # line is glued onto this backoff arc
                    stream.write(
                        f"{state} {backoff_state} {this_disambig_symbol} <eps> {cost:.3f}\n"
                    )
        finally:
            if opened_here:
                stream.close()


# noinspection PyUnresolvedReferences
@dataclassy.dataclass(slots=True)
class PronunciationProbabilityCounter:
    """
    Data class for count information used in pronunciation probability modeling

    Parameters
    ----------
    ngram_counts: collections.defaultdict
        Counts of ngrams
    word_pronunciation_counts: collections.defaultdict
        Counts of word pronunciations
    silence_following_counts: collections.Counter
        Counts of silence following pronunciation
    non_silence_following_counts: collections.Counter
        Counts of non-silence following pronunciation
    silence_before_counts: collections.Counter
        Counts of silence before pronunciation
    non_silence_before_counts: collections.Counter
        Counts of non-silence before pronunciation
    """

    ngram_counts: collections.defaultdict = dataclassy.factory(collections.defaultdict)
    word_pronunciation_counts: collections.defaultdict = dataclassy.factory(
        collections.defaultdict
    )
    silence_following_counts: collections.Counter = dataclassy.factory(collections.Counter)
    non_silence_following_counts: collections.Counter = dataclassy.factory(collections.Counter)
    silence_before_counts: collections.Counter = dataclassy.factory(collections.Counter)
    non_silence_before_counts: collections.Counter = dataclassy.factory(collections.Counter)

    def __post_init__(self) -> None:
        """Replace both nested count mappings with fresh defaultdicts of Counters"""
        self.ngram_counts = collections.defaultdict(collections.Counter)
        self.word_pronunciation_counts = collections.defaultdict(collections.Counter)

    def add_counts(self, other_counter: PronunciationProbabilityCounter) -> None:
        """
        Combine counts of two :class:`~montreal_forced_aligner.data.PronunciationProbabilityCounter`

        Parameters
        ----------
        other_counter: :class:`~montreal_forced_aligner.data.PronunciationProbabilityCounter`
            Other object with pronunciation probability counts
        """
        # Only the "silence" and "non_silence" entries of each ngram are merged
        for ngram, incoming in other_counter.ngram_counts.items():
            merged = self.ngram_counts[ngram]
            for key in ("silence", "non_silence"):
                merged[key] += incoming[key]

        for word, pronunciation_counts in other_counter.word_pronunciation_counts.items():
            for pronunciation, count in pronunciation_counts.items():
                self.word_pronunciation_counts[word][pronunciation] += count

        # The four flat counters are summed key-wise
        for counter_name in (
            "silence_following_counts",
            "non_silence_following_counts",
            "silence_before_counts",
            "non_silence_before_counts",
        ):
            getattr(self, counter_name).update(getattr(other_counter, counter_name))




# noinspection PyUnresolvedReferences
@dataclassy.dataclass(slots=True)
class CtmInterval:
    """
    Data class for intervals derived from CTM files

    Parameters
    ----------
    begin: float
        Start time of interval
    end: float
        End time of interval
    label: int or str
        Text of interval
    confidence: float, optional
        Confidence score of the interval
    """

    begin: float
    end: float
    label: typing.Union[int, str]
    confidence: typing.Optional[float] = None

    def __lt__(self, other: CtmInterval):
        """Sorting function for CtmIntervals, ordering by begin time"""
        return self.begin < other.begin

    def __add__(self, other):
        """
        Concatenate a string onto the label, or shift the interval in time

        Parameters
        ----------
        other: str or float
            String to append to the label, or offset to add to both begin and end

        Returns
        -------
        str or :class:`~montreal_forced_aligner.data.CtmInterval`
            The concatenated label for string input (the interval is left unchanged);
            otherwise this same interval, shifted in place
        """
        if isinstance(other, str):
            return self.label + other
        self.begin += other
        self.end += other
        # Return the shifted interval so ``interval += offset`` does not rebind to None
        return self

    def __post_init__(self) -> None:
        """
        Check on data validity

        Raises
        ------
        :class:`~montreal_forced_aligner.exceptions.CtmError`
            If begin or end are not valid
        """
        if self.end < -1 or self.begin == 1000000:
            raise CtmError(self)

    def to_tg_interval(self, file_duration=None) -> Interval:
        """
        Converts the CTMInterval to
        `PraatIO's Interval class <http://timmahrt.github.io/praatIO/praatio/utilities/constants.html#Interval>`_

        Parameters
        ----------
        file_duration: float, optional
            If given, an end time beyond this value is clipped to it

        Returns
        -------
        :class:`praatio.utilities.constants.Interval`
            Derived PraatIO Interval with begin and end rounded to 6 decimal places

        Raises
        ------
        :class:`~montreal_forced_aligner.exceptions.CtmError`
            If begin or end are not valid, or the rounded begin is not before the
            (possibly clipped) end
        """
        if self.end < -1 or self.begin == 1000000:
            raise CtmError(self)
        end = round(self.end, 6)
        begin = round(self.begin, 6)
        if file_duration is not None and end > file_duration:
            end = round(file_duration, 6)
        if begin >= end:
            raise CtmError(self)
        return Interval(begin, end, self.label)




# noinspection PyUnresolvedReferences
@dataclassy.dataclass(slots=True)
class WordCtmInterval:
    """
    Word-level interval derived from CTM files, identified by integer ids

    Parameters
    ----------
    begin: float
        Time at which the interval starts
    end: float
        Time at which the interval ends
    word_id: int
        Integer id of the word
    pronunciation_id: int
        Integer id of the word's pronunciation
    """

    begin: float
    end: float
    word_id: int
    pronunciation_id: int

    def __lt__(self, other: WordCtmInterval):
        """Order WordCtmIntervals by their begin time"""
        starts_earlier = self.begin < other.begin
        return starts_earlier