"""Class definitions for Speakers, Files, Utterances and Jobs"""
from __future__ import annotations
import os
import sys
import traceback
import typing
from typing import TYPE_CHECKING, Optional, Union
from praatio import textgrid
from montreal_forced_aligner.corpus.helper import get_wav_info, load_text
from montreal_forced_aligner.data import SoundFileInformation, TextFileType
from montreal_forced_aligner.exceptions import TextGridParseError, TextParseError
if TYPE_CHECKING:
from dataclasses import dataclass
else:
from dataclassy import dataclass
__all__ = ["FileData", "UtteranceData"]
# [docs]  (appears to be Sphinx source-link text left over from HTML extraction; not part of the module)
@dataclass(slots=True)
class FileData:
    """
    Data class for file information

    Parameters
    ----------
    name: str
        File name
    wav_path: str, optional
        Path to sound file
    text_path: str, optional
        Path to transcription file
    text_type: :class:`~montreal_forced_aligner.data.TextFileType`
        Kind of transcription attached to the file; this class uses the
        ``NONE``, ``LAB`` and ``TEXTGRID`` members
    relative_path: str
        Path relative to corpus root directory
    wav_info: :class:`~montreal_forced_aligner.data.SoundFileInformation`, optional
        Information about the sound file, None when there is no sound file
    speaker_ordering: list[str]
        List of speakers in the file
    utterances: list[:class:`~montreal_forced_aligner.corpus.classes.UtteranceData`]
        Utterance data for the file
    """

    name: str
    wav_path: typing.Optional[str]
    text_path: typing.Optional[str]
    text_type: TextFileType
    relative_path: str
    wav_info: typing.Optional[SoundFileInformation] = None
    # NOTE(review): these list defaults rely on the runtime ``dataclass`` (dataclassy)
    # copying mutable defaults for every instance; the standard-library decorator
    # would reject them. Confirm before switching decorators.
    speaker_ordering: typing.List[str] = []
    utterances: typing.List[UtteranceData] = []

    @classmethod
    def parse_file(
        cls,
        file_name: str,
        wav_path: Optional[str],
        text_path: Optional[str],
        relative_path: str,
        speaker_characters: Union[int, str],
        enforce_sample_rate: Optional[int] = None,
    ) -> FileData:
        """
        Parse a collection of sound file and transcription file into a File

        Parameters
        ----------
        file_name: str
            File identifier
        wav_path: str, optional
            Full sound file path
        text_path: str, optional
            Full transcription path; it is used to locate the containing
            directory when ``wav_path`` is None, so it must be given in that case
        relative_path: str
            Relative path from the corpus directory root
        speaker_characters: int or str
            How to derive the speaker from the file: a falsy value uses the name of the
            directory containing the file, an integer uses that many leading characters
            of ``file_name``, ``"prosodylab"`` uses the second underscore-delimited field
            of ``file_name``, and any other string uses ``file_name`` itself
        enforce_sample_rate: int, optional
            Passed through to :func:`~montreal_forced_aligner.corpus.helper.get_wav_info`

        Returns
        -------
        :class:`~montreal_forced_aligner.corpus.classes.FileData`
            Parsed file

        Raises
        ------
        :class:`~montreal_forced_aligner.exceptions.TextParseError`
            If a lab transcription cannot be decoded
        :class:`~montreal_forced_aligner.exceptions.TextGridParseError`
            If a TextGrid cannot be opened or contains no tiers
        """
        # The transcription type is decided purely by the file extension
        if text_path is None:
            text_type = TextFileType.NONE
        elif str(text_path).lower().endswith(".textgrid"):
            text_type = TextFileType.TEXTGRID
        else:
            text_type = TextFileType.LAB

        # Construct through ``cls`` so that subclasses get instances of their own type
        file = cls(
            file_name, wav_path, text_path, relative_path=relative_path, text_type=text_type
        )

        if wav_path is not None:
            root = os.path.dirname(wav_path)
            file.wav_info = get_wav_info(
                wav_path,
                enforce_mono=file.text_type is TextFileType.LAB,
                enforce_sample_rate=enforce_sample_rate,
            )
        else:
            root = os.path.dirname(text_path)

        if not speaker_characters:
            speaker_name = os.path.basename(root)
        elif isinstance(speaker_characters, int):
            speaker_name = file_name[:speaker_characters]
        elif speaker_characters == "prosodylab":
            # Raises IndexError when file_name contains no underscore
            speaker_name = file_name.split("_")[1]
        else:
            speaker_name = file_name

        # Only a TextGrid parsed without speaker_characters takes its speakers from
        # the tier names; in every other case the derived speaker applies to the file
        root_speaker = None
        if speaker_characters or file.text_type != TextFileType.TEXTGRID:
            root_speaker = speaker_name
        file.load_text(root_speaker=root_speaker)
        return file

    def load_text(
        self,
        root_speaker: Optional[str] = None,
    ) -> None:
        """
        Load the transcription text from the text_file of the object

        Utterances are appended to ``utterances`` and speakers to ``speaker_ordering``.

        Parameters
        ----------
        root_speaker: str, optional
            Speaker to assign to every utterance; when it is not given for a TextGrid,
            each interval tier's name is used as the speaker instead

        Raises
        ------
        :class:`~montreal_forced_aligner.exceptions.TextParseError`
            If a lab transcription raises a UnicodeDecodeError while loading
        :class:`~montreal_forced_aligner.exceptions.TextGridParseError`
            If the TextGrid cannot be opened or has no tiers
        """
        if self.text_type == TextFileType.LAB:
            try:
                # Inside the method body this name resolves to the module-level helper
                text = load_text(self.text_path)
            except UnicodeDecodeError as e:
                raise TextParseError(self.text_path) from e
            # Without sound file information the end time is set to the placeholder -1
            end = -1 if self.wav_info is None else self.wav_info.duration
            self.utterances.append(
                UtteranceData(
                    speaker_name=root_speaker,
                    file_name=self.name,
                    text=text,
                    begin=0,
                    channel=0,
                    end=end,
                )
            )
            self.speaker_ordering.append(root_speaker)
        elif self.text_type == TextFileType.TEXTGRID:
            try:
                tg = textgrid.openTextgrid(self.text_path, includeEmptyIntervals=False)
            except Exception as e:
                # Any failure while opening is reported with the formatted traceback
                raise TextGridParseError(
                    self.text_path,
                    "\n".join(traceback.format_exception(type(e), e, e.__traceback__)),
                ) from e

            num_tiers = len(tg.tierNames)
            if num_tiers == 0:
                raise TextGridParseError(self.text_path, "Number of tiers parsed was zero")

            num_channels = 1
            if self.wav_info is not None:
                duration = self.wav_info.duration
                num_channels = self.wav_info.num_channels
            else:
                duration = tg.maxTimestamp

            if root_speaker:
                self.speaker_ordering.append(root_speaker)

            for i, tier_name in enumerate(tg.tierNames):
                if tier_name.lower() == "notes":
                    continue
                ti = tg.getTier(tier_name)
                if not isinstance(ti, textgrid.IntervalTier):
                    continue
                if root_speaker:
                    speaker_name = root_speaker
                else:
                    speaker_name = tier_name.strip()
                    self.speaker_ordering.append(speaker_name)
                # For two-channel audio, tiers in the second half of the tier list
                # (by position, skipped tiers included) are assigned to channel 1
                channel = 1 if num_channels == 2 and i >= num_tiers / 2 else 0
                for begin, end, text in ti.entries:
                    text = text.strip()
                    if not text:
                        continue
                    begin, end = round(begin, 4), round(end, 4)
                    # Drop intervals starting at or after the end of the file and
                    # clip the ones that run past it
                    if begin >= duration:
                        continue
                    end = min(end, duration)
                    self.utterances.append(
                        UtteranceData(
                            speaker_name=speaker_name,
                            file_name=self.name,
                            begin=begin,
                            end=end,
                            text=text,
                            channel=channel,
                        )
                    )
        else:
            # No transcription: a single text-less utterance spans the whole file
            # (or one second when there is no sound file information either)
            duration = self.wav_info.duration if self.wav_info is not None else 1
            self.utterances.append(
                UtteranceData(
                    speaker_name=root_speaker,
                    file_name=self.name,
                    begin=0,
                    channel=0,
                    end=duration,
                )
            )
            self.speaker_ordering.append(root_speaker)
# [docs]  (appears to be Sphinx source-link text left over from HTML extraction; not part of the module)
@dataclass(slots=True)
class UtteranceData:
"""
Data class for utterance information
Parameters
----------
speaker_name: str
Speaker name
file_name: str
File name
begin: float, optional
Begin timestamp
end: float, optional
End timestamp
channel: int, optional
Sound file channel
text: str, optional
Utterance text
oovs: set[str]
Set of words not found in a look up
"""
speaker_name: str
file_name: str
begin: float
end: float
channel: int = 0
text: str = ""
normalized_text: str = ""
normalized_character_text: str = ""
oovs: str = ""