import os
import math
import subprocess
import re
from collections import defaultdict, Counter
from .helper import thirdparty_binary
from .exceptions import DictionaryPathError, DictionaryFileError, DictionaryError
def compile_graphemes(graphemes):
    """Build a regex capturing a leading run of the given graphemes.

    A hyphen, if present in the grapheme set, is placed at the front of the
    character class so it is not read as a range operator.
    """
    has_hyphen = '-' in graphemes
    chars = ''.join(g for g in graphemes if g != '-')
    if has_hyphen:
        pattern = r'^\W*([-{}]+)\W*'.format(chars)
    else:
        pattern = r'^\W*([{}]+)\W*'.format(chars)
    try:
        return re.compile(pattern)
    except Exception:
        # Surface the offending grapheme set before re-raising
        print(graphemes)
        raise
def sanitize(item):
    """Strip leading/trailing punctuation from ``item``, keeping word
    characters and the clitic markers "-" and "'" intact."""
    without_leading = re.sub(r"^[^-\w']+", '', item)
    return re.sub(r"[^-\w']+$", '', without_leading)
def sanitize_clitics(item):
    """Strip all leading/trailing non-word characters from ``item``,
    including the clitic markers "-" and "'"."""
    trimmed = re.sub(r"^\W+", '', item)
    return re.sub(r"\W+$", '', trimmed)
class Dictionary(object):
    """
    Class containing information about a pronunciation dictionary

    Parameters
    ----------
    input_path : str
        Path to an input pronunciation dictionary
    output_directory : str
        Path to a directory to store files for Kaldi
    oov_code : str, optional
        What to label words not in the dictionary, defaults to ``'<unk>'``
    position_dependent_phones : bool, optional
        Specifies whether phones should be represented as dependent on their
        position in the word (beginning, middle or end), defaults to True
    num_sil_states : int, optional
        Number of states to use for silence phones, defaults to 5
    num_nonsil_states : int, optional
        Number of states to use for non-silence phones, defaults to 3
    shared_silence_phones : bool, optional
        Specify whether to share states across all silence phones, defaults
        to True
    sil_prob : float, optional
        Probability of optional silences following words, defaults to 0.5
    word_set : collection of str, optional
        If specified, limit the loaded lexicon to (sanitized) words in this set
    debug : bool, optional
        Output debugging artifacts (FST drawings), defaults to False

    Notes
    -----
    Whether pronunciation probabilities are modelled is detected from the
    dictionary file itself (see ``pronunciation_probabilities``), not passed
    as a parameter.
    """
    # Kaldi HMM topology templates used by _write_topo
    topo_template = '<State> {cur_state} <PdfClass> {cur_state} <Transition> {cur_state} 0.75 <Transition> {next_state} 0.25 </State>'
    topo_sil_template = '<State> {cur_state} <PdfClass> {cur_state} {transitions} </State>'
    topo_transition_template = '<Transition> {} {}'
    # Word-position suffixes: begin, end, internal, singleton
    positions = ["_B", "_E", "_I", "_S"]
    clitic_markers = ["'", '-']
def __init__(self, input_path, output_directory, oov_code='<unk>',
             position_dependent_phones=True, num_sil_states=5,
             num_nonsil_states=3, shared_silence_phones=True,
             sil_prob=0.5, word_set=None, debug=False):
    """Load and parse a pronunciation dictionary file.

    Populates ``self.words`` (word -> list of (pronunciation, probability)
    pairs), the grapheme and non-silence phone inventories, and default
    entries for the silence word and the OOV placeholder.
    """
    if not os.path.exists(input_path):
        raise (DictionaryPathError(input_path))
    if not os.path.isfile(input_path):
        raise (DictionaryFileError(input_path))
    self.input_path = input_path
    self.debug = debug
    self.output_directory = os.path.join(output_directory, 'dictionary')
    self.num_sil_states = num_sil_states
    self.num_nonsil_states = num_nonsil_states
    self.shared_silence_phones = shared_silence_phones
    self.sil_prob = sil_prob
    self.oov_code = oov_code
    self.position_dependent_phones = position_dependent_phones
    self.words = defaultdict(list)
    self.nonsil_phones = set()
    self.sil_phones = {'sp', 'spn', 'sil'}
    self.optional_silence = 'sp'
    self.nonoptional_silence = 'sil'
    self.graphemes = set()
    if word_set is not None:
        # Normalize the filter set the same way lexicon words are sanitized
        word_set = {sanitize(x) for x in word_set}
    # Default entries: silence word and OOV placeholder
    self.words['!sil'].append((('sp',), 1))
    self.words[self.oov_code].append((('spn',), 1))
    # Assume probabilities are present until a line without one is seen
    self.pronunciation_probabilities = True
    with open(input_path, 'r', encoding='utf8') as inf:
        for i, line in enumerate(inf):
            line = line.strip()
            if not line:
                continue
            # First field is the word, the rest is the pronunciation
            line = line.split()
            word = line.pop(0).lower()
            if not line:
                raise DictionaryError('Line {} of {} does not have a pronunciation.'.format(i, input_path))
            if word in ['!sil', oov_code]:
                continue
            if word_set is not None and sanitize(word) not in word_set:
                continue
            self.graphemes.update(word)
            try:
                # An optional pronunciation probability may precede the phones
                prob = float(line[0])
                line = line[1:]
            except ValueError:
                prob = None
                self.pronunciation_probabilities = False
            pron = tuple(line)
            # Pronunciations containing silence phones don't contribute to
            # the non-silence phone inventory
            if not any(x in self.sil_phones for x in pron):
                self.nonsil_phones.update(pron)
            # Skip duplicate pronunciations for the same word
            if word in self.words and pron in set(x[0] for x in self.words[word]):
                continue
            self.words[word].append((pron, prob))
    self.word_pattern = compile_graphemes(self.graphemes)
    # Integer id mappings are filled in later by generate_mappings()
    self.phone_mapping = {}
    self.words_mapping = {}
def generate_mappings(self):
self.phone_mapping = {}
i = 0
self.phone_mapping['<eps>'] = i
if self.position_dependent_phones:
for p in self.positional_sil_phones:
i += 1
self.phone_mapping[p] = i
for p in self.positional_nonsil_phones:
i += 1
self.phone_mapping[p] = i
else:
for p in sorted(self.sil_phones):
i += 1
self.phone_mapping[p] = i
for p in sorted(self.nonsil_phones):
i += 1
self.phone_mapping[p] = i
self.words_mapping = {}
i = 0
self.words_mapping['<eps>'] = i
for w in sorted(self.words.keys()):
i += 1
self.words_mapping[w] = i
self.words_mapping['#0'] = i + 1
self.words_mapping['<s>'] = i + 2
self.words_mapping['</s>'] = i + 3
self.oovs_found = set()
self.add_disambiguation()
def add_disambiguation(self):
subsequences = set()
pronunciation_counts = defaultdict(int)
for w, prons in self.words.items():
for p in prons:
pronunciation_counts[p[0]] += 1
pron = [x for x in p[0]][:-1]
while pron:
subsequences.add(tuple(p))
pron = pron[:-1]
last_used = defaultdict(int)
for w, prons in sorted(self.words.items()):
new_prons = []
for p in prons:
if pronunciation_counts[p[0]] == 1 and not p[0] in subsequences:
disambig = None
else:
pron = p[0]
last_used[pron] += 1
disambig = last_used[pron]
new_prons.append((p[0], p[1], disambig))
self.words[w] = new_prons
if last_used:
self.max_disambig = max(last_used.values())
else:
self.max_disambig = 0
self.disambig = set('#{}'.format(x) for x in range(self.max_disambig + 1))
i = max(self.phone_mapping.values())
for p in sorted(self.disambig):
i += 1
self.phone_mapping[p] = i
def create_utterance_fst(self, text, frequent_words):
num_words = len(text)
word_probs = Counter(text)
word_probs = {k: v / num_words for k, v in word_probs.items()}
word_probs.update(frequent_words)
text = ''
for k, v in word_probs.items():
cost = -1 * math.log(v)
text += '0 0 {w} {w} {cost}\n'.format(w=self.to_int(k), cost=cost)
text += '0 {}\n'.format(-1 * math.log(1 / num_words))
return text
def to_int(self, item):
"""
Convert a given word into its integer id
"""
if item == '':
return None
item = self._lookup(item)
if item not in self.words_mapping:
self.oovs_found.add(item)
return self.oov_int
return self.words_mapping[item]
def save_oovs_found(self, directory):
"""
Save all out of vocabulary items to a file in the specified directory
Parameters
----------
directory : str
Path to directory to save ``oovs_found.txt``
"""
with open(os.path.join(directory, 'oovs_found.txt'), 'w', encoding='utf8') as f:
for oov in sorted(self.oovs_found):
f.write(oov + '\n')
self.oovs_found = set()
def _lookup(self, item):
    """Return the form of ``item`` that exists in the words mapping.

    Tries the item as-is, then with punctuation stripped (clitic markers
    kept), then with clitic markers stripped too; falls back to the
    original item when nothing matches.
    """
    for candidate in (item, sanitize(item), sanitize_clitics(item)):
        if candidate in self.words_mapping:
            return candidate
    return item
def separate_clitics(self, item):
    """Separates words with apostrophes or hyphens if the subparts are in the lexicon.

    Checks whether the text on either side of an apostrophe or hyphen is in the dictionary. If so,
    splits the word. If neither part is in the dictionary, returns the word without splitting it.

    Parameters
    ----------
    item : string
        Lexical item

    Returns
    -------
    vocab_items: list
        List containing all words after any splits due to apostrophes or hyphens
    """
    # Bracketed/tagged units like [noise], {x} or <eps> are never split
    unit_re = re.compile(r'^(\[.*\]|\{.*\}|<.*>)$')
    if unit_re.match(item) is not None:
        return [item]
    lookup = self._lookup(item)
    if lookup not in self.words_mapping:
        item = sanitize(item)
        vocab = []
        chars = list(item)
        # Number of clitic markers bounds how many split passes are made
        count = 0
        for i in chars:
            if i in self.clitic_markers:
                count += 1
        for i in range(count):
            for punc in chars:
                if punc in self.clitic_markers:
                    # NOTE(review): index() locates the FIRST occurrence of
                    # this marker character in the remaining chars — assumed
                    # to be the marker currently reached by the scan
                    idx = chars.index(punc)
                    option1withpunc = ''.join(chars[:idx + 1])
                    option1nopunc = ''.join(chars[:idx])
                    option2withpunc = ''.join(chars[idx:])
                    option2nopunc = ''.join(chars[idx + 1:])
                    # Prefer keeping the marker attached to the left part
                    # when that form is a known word
                    if option1withpunc in self.words:
                        vocab.append(option1withpunc)
                        if option2nopunc in self.words:
                            vocab.append(option2nopunc)
                        elif all(x not in list(option2nopunc) for x in self.clitic_markers):
                            # Remainder has no more markers; keep it as-is
                            vocab.append(option2nopunc)
                    else:
                        vocab.append(option1nopunc)
                        if option2withpunc in self.words:
                            vocab.append(option2withpunc)
                        elif option2nopunc in self.words:
                            vocab.append(option2nopunc)
                        elif all(x not in list(option2nopunc) for x in self.clitic_markers):
                            vocab.append(option2nopunc)
                    # Continue scanning the text after the marker
                    chars = list(option2nopunc)
    else:
        return [lookup]
    if not vocab:
        return [lookup]
    else:
        unk = []
        for i in vocab:
            if i not in self.words:
                unk.append(i)
        # If every produced piece is unknown, splitting gained nothing
        if len(unk) == count + 1:
            return [lookup]
    return vocab
@property
def reversed_word_mapping(self):
"""
A mapping of integer ids to words
"""
mapping = {}
for k, v in self.words_mapping.items():
mapping[v] = k
return mapping
@property
def reversed_phone_mapping(self):
"""
A mapping of integer ids to phones
"""
mapping = {}
for k, v in self.phone_mapping.items():
mapping[v] = k
return mapping
@property
def oov_int(self):
"""
The integer id for out of vocabulary items
"""
return self.words_mapping[self.oov_code]
@property
def positional_sil_phones(self):
"""
List of silence phones with positions
"""
sil_phones = []
for p in sorted(self.sil_phones):
sil_phones.append(p)
for pos in self.positions:
sil_phones.append(p + pos)
return sil_phones
@property
def positional_nonsil_phones(self):
"""
List of non-silence phones with positions
"""
nonsil_phones = []
for p in sorted(self.nonsil_phones):
for pos in self.positions:
nonsil_phones.append(p + pos)
return nonsil_phones
@property
def optional_silence_csl(self):
"""
Phone id of the optional silence phone
"""
return '{}'.format(self.phone_mapping[self.optional_silence])
@property
def silence_csl(self):
"""
A colon-separated list (as a string) of silence phone ids
"""
if self.position_dependent_phones:
return ':'.join(map(str, (self.phone_mapping[x] for x in self.positional_sil_phones)))
else:
return ':'.join(map(str, (self.phone_mapping[x] for x in self.sil_phones)))
@property
def phones_dir(self):
"""
Directory to store information Kaldi needs about phones
"""
return os.path.join(self.output_directory, 'phones')
@property
def phones(self):
"""
The set of all phones (silence and non-silence)
"""
return self.sil_phones | self.nonsil_phones
def write(self):
    """
    Write the files necessary for Kaldi
    """
    print('Creating dictionary information...')
    os.makedirs(self.phones_dir, exist_ok=True)
    # Integer ids must exist before any of the output files can be written
    self.generate_mappings()
    self._write_graphemes()
    self._write_phone_map_file()
    self._write_phone_sets()
    self._write_phone_symbol_table()
    self._write_disambig()
    self._write_topo()
    self._write_word_boundaries()
    self._write_extra_questions()
    self._write_word_file()
    # Lexicon FSTs, with and without disambiguation symbols
    self._write_fst_text()
    self._write_fst_text(disambig=True)
    self._write_fst_binary()
    self._write_fst_binary(disambig=True)
    # self.cleanup()
def cleanup(self):
"""
Clean up temporary files in the output directory
"""
os.remove(os.path.join(self.output_directory, 'temp.fst'))
os.remove(os.path.join(self.output_directory, 'lexicon.text.fst'))
def _write_graphemes(self):
outfile = os.path.join(self.output_directory, 'graphemes.txt')
with open(outfile, 'w', encoding='utf8') as f:
for char in sorted(self.graphemes):
f.write(char + '\n')
def export_lexicon(self, path, disambig=False, probability=False):
with open(path, 'w', encoding='utf8') as f:
for w in sorted(self.words.keys()):
for p in sorted(self.words[w]):
phones = ' '.join(p[0])
if disambig and p[2] is not None:
phones += ' #{}'.format(p[2])
if probability:
f.write('{}\t{}\t{}\n'.format(w, p[1], phones))
else:
f.write('{}\t{}\n'.format(w, phones))
def _write_phone_map_file(self):
    """Write phone_map.txt mapping each base phone to its positional variants."""
    outfile = os.path.join(self.output_directory, 'phone_map.txt')
    with open(outfile, 'w', encoding='utf8') as f:
        for sp in self.sil_phones:
            if self.position_dependent_phones:
                # NOTE(review): ['', ''] repeats the base silence phone twice
                # on its line; this matches how the sil entry differs from the
                # non-silence one below, but confirm against the Kaldi recipe
                new_phones = [sp + x for x in ['', ''] + self.positions]
            else:
                new_phones = [sp]
            f.write(' '.join(new_phones) + '\n')
        for nsp in self.nonsil_phones:
            if self.position_dependent_phones:
                new_phones = [nsp + x for x in [''] + self.positions]
            else:
                new_phones = [nsp]
            f.write(' '.join(new_phones) + '\n')
def _write_phone_symbol_table(self):
outfile = os.path.join(self.output_directory, 'phones.txt')
with open(outfile, 'w', encoding='utf8') as f:
for p, i in sorted(self.phone_mapping.items(), key=lambda x: x[1]):
f.write('{} {}\n'.format(p, i))
def _write_word_boundaries(self):
boundary_path = os.path.join(self.output_directory, 'phones', 'word_boundary.txt')
boundary_int_path = os.path.join(self.output_directory, 'phones', 'word_boundary.int')
with open(boundary_path, 'w', encoding='utf8') as f, \
open(boundary_int_path, 'w', encoding='utf8') as intf:
if self.position_dependent_phones:
for p in sorted(self.phone_mapping.keys(), key=lambda x: self.phone_mapping[x]):
if p == '<eps>':
continue
cat = 'nonword'
if p.endswith('_B'):
cat = 'begin'
elif p.endswith('_S'):
cat = 'singleton'
elif p.endswith('_I'):
cat = 'internal'
elif p.endswith('_E'):
cat = 'end'
f.write(' '.join([p, cat]) + '\n')
intf.write(' '.join([str(self.phone_mapping[p]), cat]) + '\n')
def _write_word_file(self):
words_path = os.path.join(self.output_directory, 'words.txt')
with open(words_path, 'w', encoding='utf8') as f:
for w, i in sorted(self.words_mapping.items(), key=lambda x: x[1]):
f.write('{} {}\n'.format(w, i))
def _write_topo(self):
    """Write the Kaldi 'topo' HMM topology file.

    Non-silence phones get a simple left-to-right topology with
    ``num_nonsil_states`` emitting states; silence phones get
    ``num_sil_states`` states with denser transitions out of the initial
    and middle states.
    """
    filepath = os.path.join(self.output_directory, 'topo')
    # Uniform transition probability out of the initial/middle sil states
    sil_transp = 1 / (self.num_sil_states - 1)
    initial_transition = [self.topo_transition_template.format(x, sil_transp)
                          for x in range(self.num_sil_states - 1)]
    middle_transition = [self.topo_transition_template.format(x, sil_transp)
                         for x in range(1, self.num_sil_states)]
    # Last emitting state: self-loop 0.75, exit to final state 0.25
    final_transition = [self.topo_transition_template.format(self.num_sil_states - 1, 0.75),
                        self.topo_transition_template.format(self.num_sil_states, 0.25)]
    with open(filepath, 'w') as f:
        f.write('<Topology>\n')
        # Entry for non-silence phones
        f.write("<TopologyEntry>\n")
        f.write("<ForPhones>\n")
        if self.position_dependent_phones:
            phones = self.positional_nonsil_phones
        else:
            phones = sorted(self.nonsil_phones)
        f.write("{}\n".format(' '.join(str(self.phone_mapping[x]) for x in phones)))
        f.write("</ForPhones>\n")
        states = [self.topo_template.format(cur_state=x, next_state=x + 1)
                  for x in range(self.num_nonsil_states)]
        f.write('\n'.join(states))
        # Final (non-emitting) state
        f.write("\n<State> {} </State>\n".format(self.num_nonsil_states))
        f.write("</TopologyEntry>\n")
        # Entry for silence phones
        f.write("<TopologyEntry>\n")
        f.write("<ForPhones>\n")
        if self.position_dependent_phones:
            phones = self.positional_sil_phones
        else:
            phones = self.sil_phones
        f.write("{}\n".format(' '.join(str(self.phone_mapping[x]) for x in phones)))
        f.write("</ForPhones>\n")
        states = []
        for i in range(self.num_sil_states):
            if i == 0:
                transition = ' '.join(initial_transition)
            elif i == self.num_sil_states - 1:
                transition = ' '.join(final_transition)
            else:
                transition = ' '.join(middle_transition)
            states.append(self.topo_sil_template.format(cur_state=i, transitions=transition))
        f.write('\n'.join(states))
        f.write("\n<State> {} </State>\n".format(self.num_sil_states))
        f.write("</TopologyEntry>\n")
        f.write("</Topology>\n")
def _write_phone_sets(self):
    """Write sets.txt/roots.txt (and .int variants) grouping each phone's
    positional variants and marking shared/split status for tree building."""
    sharesplit = ['shared', 'split']
    if not self.shared_silence_phones:
        sil_sharesplit = ['not-shared', 'not-split']
    else:
        sil_sharesplit = sharesplit
    sets_file = os.path.join(self.output_directory, 'phones', 'sets.txt')
    roots_file = os.path.join(self.output_directory, 'phones', 'roots.txt')
    sets_int_file = os.path.join(self.output_directory, 'phones', 'sets.int')
    roots_int_file = os.path.join(self.output_directory, 'phones', 'roots.int')
    with open(sets_file, 'w', encoding='utf8') as setf, \
            open(roots_file, 'w', encoding='utf8') as rootf, \
            open(sets_int_file, 'w', encoding='utf8') as setintf, \
            open(roots_int_file, 'w', encoding='utf8') as rootintf:
        # process silence phones
        for i, sp in enumerate(self.sil_phones):
            if self.position_dependent_phones:
                # Base phone plus all positional variants form one set
                mapped = [sp + x for x in [''] + self.positions]
            else:
                mapped = [sp]
            setf.write(' '.join(mapped) + '\n')
            setintf.write(' '.join(map(str, (self.phone_mapping[x] for x in mapped))) + '\n')
            if i == 0:
                # NOTE(review): only the first silence phone carries the
                # sil-specific sharing flags; iteration order over a set is
                # arbitrary, so which phone that is depends on hashing
                line = sil_sharesplit + mapped
                lineint = sil_sharesplit + [self.phone_mapping[x] for x in mapped]
            else:
                line = sharesplit + mapped
                lineint = sharesplit + [self.phone_mapping[x] for x in mapped]
            rootf.write(' '.join(line) + '\n')
            rootintf.write(' '.join(map(str, lineint)) + '\n')
        # process nonsilence phones
        for nsp in sorted(self.nonsil_phones):
            if self.position_dependent_phones:
                mapped = [nsp + x for x in self.positions]
            else:
                mapped = [nsp]
            setf.write(' '.join(mapped) + '\n')
            setintf.write(' '.join(map(str, (self.phone_mapping[x] for x in mapped))) + '\n')
            line = sharesplit + mapped
            lineint = sharesplit + [self.phone_mapping[x] for x in mapped]
            rootf.write(' '.join(line) + '\n')
            rootintf.write(' '.join(map(str, lineint)) + '\n')
def _write_extra_questions(self):
    """Write extra_questions.txt/.int: phone groupings used as additional
    questions during decision-tree clustering."""
    phone_extra = os.path.join(self.phones_dir, 'extra_questions.txt')
    phone_extra_int = os.path.join(self.phones_dir, 'extra_questions.int')
    with open(phone_extra, 'w', encoding='utf8') as outf, \
            open(phone_extra_int, 'w', encoding='utf8') as intf:
        # One question grouping all silence phones...
        if self.position_dependent_phones:
            sils = sorted(self.positional_sil_phones)
        else:
            sils = sorted(self.sil_phones)
        outf.write(' '.join(sils) + '\n')
        intf.write(' '.join(map(str, (self.phone_mapping[x] for x in sils))) + '\n')
        # ...and one grouping all non-silence phones
        if self.position_dependent_phones:
            nonsils = sorted(self.positional_nonsil_phones)
        else:
            nonsils = sorted(self.nonsil_phones)
        outf.write(' '.join(nonsils) + '\n')
        intf.write(' '.join(map(str, (self.phone_mapping[x] for x in nonsils))) + '\n')
        if self.position_dependent_phones:
            # Per-position questions for non-silence phones
            for p in self.positions:
                line = [x + p for x in sorted(self.nonsil_phones)]
                outf.write(' '.join(line) + '\n')
                intf.write(' '.join(map(str, (self.phone_mapping[x] for x in line))) + '\n')
            # Per-position questions for silence phones (base form included)
            for p in [''] + self.positions:
                line = [x + p for x in sorted(self.sil_phones)]
                outf.write(' '.join(line) + '\n')
                intf.write(' '.join(map(str, (self.phone_mapping[x] for x in line))) + '\n')
def _write_disambig(self):
disambig = os.path.join(self.phones_dir, 'disambig.txt')
disambig_int = os.path.join(self.phones_dir, 'disambig.int')
with open(disambig, 'w', encoding='utf8') as outf, \
open(disambig_int, 'w', encoding='utf8') as intf:
for d in sorted(self.disambig):
outf.write('{}\n'.format(d))
intf.write('{}\n'.format(self.phone_mapping[d]))
def _write_fst_binary(self, disambig=False):
    """Compile the text lexicon FST into binary L.fst (or L_disambig.fst)
    using OpenFst's fstcompile/fstarcsort; optionally render a debug PDF."""
    if disambig:
        lexicon_fst_path = os.path.join(self.output_directory, 'lexicon_disambig.text.fst')
        output_fst = os.path.join(self.output_directory, 'L_disambig.fst')
    else:
        lexicon_fst_path = os.path.join(self.output_directory, 'lexicon.text.fst')
        output_fst = os.path.join(self.output_directory, 'L.fst')
    phones_file_path = os.path.join(self.output_directory, 'phones.txt')
    words_file_path = os.path.join(self.output_directory, 'words.txt')
    log_path = os.path.join(self.output_directory, 'fst.log')
    temp_fst_path = os.path.join(self.output_directory, 'temp.fst')
    subprocess.call([thirdparty_binary('fstcompile'), '--isymbols={}'.format(phones_file_path),
                     '--osymbols={}'.format(words_file_path),
                     '--keep_isymbols=false', '--keep_osymbols=false',
                     lexicon_fst_path, temp_fst_path])
    # Arcs must be sorted on output labels for later composition with G.fst
    subprocess.call([thirdparty_binary('fstarcsort'), '--sort_type=olabel',
                     temp_fst_path, output_fst])
    if self.debug:
        # Render the FST to L.dot and then to PDF for inspection
        dot_path = os.path.join(self.output_directory, 'L.dot')
        with open(log_path, 'w') as logf:
            draw_proc = subprocess.Popen([thirdparty_binary('fstdraw'), '--portrait=true',
                                          '--isymbols={}'.format(phones_file_path),
                                          '--osymbols={}'.format(words_file_path), output_fst, dot_path],
                                         stderr=logf)
            draw_proc.communicate()
            dot_proc = subprocess.Popen([thirdparty_binary('dot'), '-Tpdf', '-O', dot_path], stderr=logf)
            dot_proc.communicate()
def _write_fst_text(self, disambig=False):
    """Write the lexicon FST in OpenFst text format.

    When ``self.sil_prob`` is non-zero, optional silence is modelled with
    a start state (0), a loop state (1) and a silence state (2); the last
    arc of each word either returns to the loop state (no silence) or goes
    through the silence state.
    """
    if disambig:
        lexicon_fst_path = os.path.join(self.output_directory, 'lexicon_disambig.text.fst')
    else:
        lexicon_fst_path = os.path.join(self.output_directory, 'lexicon.text.fst')
    if self.sil_prob != 0:
        silphone = self.optional_silence
        nonoptsil = self.nonoptional_silence

        def is_sil(element):
            # NOTE(review): defined but never used below — looks like a leftover
            return element in [silphone, silphone + '_S']
        silcost = -1 * math.log(self.sil_prob)
        nosilcost = -1 * math.log(1.0 - self.sil_prob)
        startstate = 0
        loopstate = 1
        silstate = 2
    else:
        loopstate = 0
        nextstate = 1
    with open(lexicon_fst_path, 'w', encoding='utf8') as outf:
        if self.sil_prob != 0:
            # Entry arcs: skip silence (nosilcost) or take it (silcost),
            # then the silence state returns to the loop state
            outf.write('\t'.join(map(str, [startstate, loopstate, '<eps>', '<eps>', nosilcost])) + '\n')
            outf.write('\t'.join(map(str, [startstate, loopstate, nonoptsil, '<eps>', silcost])) + "\n")
            outf.write('\t'.join(map(str, [silstate, loopstate, silphone, '<eps>'])) + "\n")
            nextstate = 3
        for w in sorted(self.words.keys()):
            for phones, prob, disambig_symbol in sorted(self.words[w]):
                phones = [x for x in phones]
                if self.position_dependent_phones:
                    # Tag phones with word-position suffixes
                    if len(phones) == 1:
                        phones[0] += '_S'
                    else:
                        for i in range(len(phones)):
                            if i == 0:
                                phones[i] += '_B'
                            elif i == len(phones) - 1:
                                phones[i] += '_E'
                            else:
                                phones[i] += '_I'
                if not self.pronunciation_probabilities:
                    pron_cost = 0
                else:
                    if prob is None:
                        prob = 1.0
                    pron_cost = -1 * math.log(prob)
                # The pronunciation cost is emitted on the first arc only
                pron_cost_string = ''
                if pron_cost != 0:
                    pron_cost_string = '\t{}'.format(pron_cost)
                s = loopstate
                word_or_eps = w
                # NOTE(review): nosilcost/silcost are referenced here even
                # when sil_prob == 0, in which case they are unbound —
                # confirm whether the sil_prob == 0 path is ever exercised
                local_nosilcost = nosilcost + pron_cost
                local_silcost = silcost + pron_cost
                while len(phones) > 0:
                    p = phones.pop(0)
                    if len(phones) > 0 or (disambig and disambig_symbol is not None):
                        # Intermediate arc; word label and cost only on the
                        # first arc of the pronunciation
                        ns = nextstate
                        nextstate += 1
                        outf.write('\t'.join(map(str, [s, ns, p, word_or_eps])) + pron_cost_string + '\n')
                        word_or_eps = '<eps>'
                        pron_cost_string = ""
                        pron_cost = 0.0
                        s = ns
                    elif self.sil_prob == 0:
                        # No optional silence: last phone closes to the loop
                        ns = loopstate
                        outf.write('\t'.join(map(str, [s, ns, p, word_or_eps])) + pron_cost_string + '\n')
                        word_or_eps = '<eps>'
                        pron_cost_string = ""
                        s = ns
                    else:
                        # Last phone: close to the loop state (no silence)
                        # and to the silence state (optional silence follows)
                        outf.write('\t'.join(map(str, [s, loopstate, p, word_or_eps, local_nosilcost])) + "\n")
                        outf.write('\t'.join(map(str, [s, silstate, p, word_or_eps, local_silcost])) + "\n")
                if disambig and disambig_symbol is not None:
                    # The disambiguation symbol forms the closing arc pair
                    outf.write('\t'.join(map(str, [s, loopstate, '#{}'.format(disambig_symbol), word_or_eps,
                                                   local_nosilcost])) + "\n")
                    outf.write('\t'.join(
                        map(str, [s, silstate, '#{}'.format(disambig_symbol), word_or_eps, local_silcost])) + "\n")
        # Mark the loop state as final with zero cost
        outf.write("{}\t{}\n".format(loopstate, 0))
class OrthographicDictionary(Dictionary):
    """Dictionary built from an in-memory word -> graphemes mapping, using
    each word's orthography directly as its pronunciation."""

    def __init__(self, input_dict, output_directory, oov_code='<unk>',
                 position_dependent_phones=True, num_sil_states=5,
                 num_nonsil_states=3, shared_silence_phones=False,
                 pronunciation_probabilities=True,
                 sil_prob=0.5, debug=False):
        # NOTE(review): does not call Dictionary.__init__; it re-implements
        # the setup for dict input instead of a lexicon file
        self.debug = debug
        self.output_directory = os.path.join(output_directory, 'dictionary')
        self.num_sil_states = num_sil_states
        self.num_nonsil_states = num_nonsil_states
        self.shared_silence_phones = shared_silence_phones
        self.sil_prob = sil_prob
        self.oov_code = oov_code
        self.position_dependent_phones = position_dependent_phones
        self.pronunciation_probabilities = pronunciation_probabilities
        self.words = defaultdict(list)
        self.nonsil_phones = set()
        self.sil_phones = {'sp', 'spn', 'sil'}
        self.optional_silence = 'sp'
        self.nonoptional_silence = 'sil'
        self.graphemes = set()
        for w in input_dict:
            self.graphemes.update(w)
            # The orthographic symbols double as the pronunciation
            pron = tuple(input_dict[w])
            self.words[w].append((pron, None))
            self.nonsil_phones.update(pron)
        self.word_pattern = compile_graphemes(self.graphemes)
        # NOTE(review): uses '!SIL' (uppercase) where Dictionary uses '!sil',
        # and 'sil' rather than 'sp' for its pronunciation — confirm intended
        self.words['!SIL'].append((('sil',), None))
        self.words[self.oov_code].append((('spn',), None))
        # Build phone id mapping: <eps> first, then silence, then non-silence
        self.phone_mapping = {}
        i = 0
        self.phone_mapping['<eps>'] = i
        if self.position_dependent_phones:
            for p in self.positional_sil_phones:
                i += 1
                self.phone_mapping[p] = i
            for p in self.positional_nonsil_phones:
                i += 1
                self.phone_mapping[p] = i
        else:
            for p in sorted(self.sil_phones):
                i += 1
                self.phone_mapping[p] = i
            for p in sorted(self.nonsil_phones):
                i += 1
                self.phone_mapping[p] = i
        # Build word id mapping with grammar symbols appended
        self.words_mapping = {}
        i = 0
        self.words_mapping['<eps>'] = i
        for w in sorted(self.words.keys()):
            i += 1
            self.words_mapping[w] = i
        self.words_mapping['#0'] = i + 1
        self.words_mapping['<s>'] = i + 2
        self.words_mapping['</s>'] = i + 3
        self.oovs_found = set()
        self.add_disambiguation()