Source code for aligner.models

import os
import pickle
import yaml

from tempfile import mkdtemp
from shutil import copy, copyfile, rmtree, make_archive, unpack_archive

# default format for output
FORMAT = "zip"

from . import __version__
from .exceptions import PronunciationAcousticMismatchError, PronunciationOrthographyMismatchError


class Archive(object):
    """
    Class representing data in a directory or archive file (zip, tar,
    tar.gz/tgz)

    Largely duplicated from the prosodylab-aligner
    (https://github.com/prosodylab/Prosodylab-Aligner) archive class.
    """

    def __init__(self, source, is_tmpdir=False):
        self._meta = {}
        self.name, _ = os.path.splitext(os.path.basename(source))
        if os.path.isdir(source):
            self.dirname = os.path.abspath(source)
            self.is_tmpdir = is_tmpdir  # trust caller
        else:
            base = mkdtemp(dir=os.environ.get("TMPDIR", None))
            unpack_archive(source, base)
            (head, tail, _) = next(os.walk(base))
            if not tail:
                raise ValueError("'{}' is empty.".format(source))
            if len(tail) > 1:
                raise ValueError("'{}' is a bomb.".format(source))
            self.dirname = os.path.join(head, tail[0])
            self.is_tmpdir = True  # ignore caller

    @classmethod
    def empty(cls, head):
        """
        Initialize an archive using an empty directory
        """
        base = mkdtemp(dir=os.environ.get("TMPDIR", None))
        source = os.path.join(base, head)
        os.makedirs(source, exist_ok=True)
        return cls(source, True)

    def add(self, source):
        """
        Add file into archive
        """
        copy(source, self.dirname)

    def __repr__(self):
        return "{}(dirname={!r})".format(self.__class__.__name__,
                                         self.dirname)

    def dump(self, sink, archive_fmt=FORMAT):
        """
        Write archive to disk, and return the name of final archive
        """
        return make_archive(sink, archive_fmt,
                            *os.path.split(self.dirname))

    def __del__(self):
        if self.is_tmpdir:
            rmtree(self.dirname)


[docs]class AcousticModel(Archive): def add_meta_file(self, aligner): with open(os.path.join(self.dirname, 'meta.yaml'), 'w') as f: yaml.dump(aligner.meta, f) @property def meta(self): if not self._meta: meta_path = os.path.join(self.dirname, 'meta.yaml') if not os.path.exists(meta_path): self._meta = {'version': '0.9.0', 'architecture': 'gmm-hmm'} else: with open(meta_path, 'r') as f: self._meta = yaml.load(f) self._meta['phones'] = set(self._meta.get('phones', [])) return self._meta def add_triphone_model(self, source): """ Add file into archive """ copyfile(os.path.join(source, 'final.mdl'), os.path.join(self.dirname, 'ali-final.mdl')) copyfile(os.path.join(source, 'final.occs'), os.path.join(self.dirname, 'ali-final.occs')) copyfile(os.path.join(source, 'tree'), os.path.join(self.dirname, 'ali-tree')) def add_triphone_fmllr_model(self, source): """ Add file into archive """ copy(os.path.join(source, 'final.mdl'), self.dirname) copy(os.path.join(source, 'final.occs'), self.dirname) copy(os.path.join(source, 'tree'), self.dirname) def export_triphone_model(self, destination): """ """ os.makedirs(destination, exist_ok=True) copyfile(os.path.join(self.dirname, 'final.mdl'), os.path.join(destination, 'final.mdl')) copyfile(os.path.join(self.dirname, 'final.occs'), os.path.join(destination, 'final.occs')) copyfile(os.path.join(self.dirname, 'tree'), os.path.join(destination, 'tree')) def export_triphone_fmllr_model(self, destination): """ """ os.makedirs(destination, exist_ok=True) copy(os.path.join(self.dirname, 'final.mdl'), destination) copy(os.path.join(self.dirname, 'final.occs'), destination) copy(os.path.join(self.dirname, 'tree'), destination) def validate(self, dictionary): if isinstance(dictionary, G2PModel): missing_phones = dictionary.meta['phones'] - set(self.meta['phones']) else: missing_phones = dictionary.nonsil_phones - set(self.meta['phones']) if missing_phones: raise (PronunciationAcousticMismatchError(missing_phones))
[docs]class G2PModel(Archive): def add_meta_file(self, dictionary): with open(os.path.join(self.dirname, 'meta.yaml'), 'w') as f: meta = {'phones': sorted(dictionary.nonsil_phones), 'graphemes': sorted(dictionary.graphemes), 'architecture': 'phonetisaurus', 'version': __version__} yaml.dump(meta, f) @property def meta(self): if not self._meta: meta_path = os.path.join(self.dirname, 'meta.yaml') if not os.path.exists(meta_path): self._meta = {'version': '0.9.0', 'architecture': 'phonetisaurus'} else: with open(meta_path, 'r') as f: self._meta = yaml.load(f) self._meta['phones'] = set(self._meta.get('phones', [])) self._meta['graphemes'] = set(self._meta.get('graphemes', [])) return self._meta @property def fst_path(self): return os.path.join(self.dirname, 'model.fst') def add_fst_model(self, source): """ Add file into archive """ copyfile(os.path.join(source, 'model.fst'), self.fst_path) def export_fst_model(self, destination): os.makedirs(destination, exist_ok=True) copy(self.fst_path, destination) def validate(self, corpus): return True # FIXME add actual validation