| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206 |
- import os
- from pathlib import Path
- from typing import Tuple, Union
- import torchaudio
- from torch import Tensor
- from torch.hub import download_url_to_file
- from torch.utils.data import Dataset
- from torchaudio.datasets.utils import extract_archive
- _RELEASE_CONFIGS = {
- "release1": {
- "folder_in_archive": "TEDLIUM_release1",
- "url": "http://www.openslr.org/resources/7/TEDLIUM_release1.tar.gz",
- "checksum": "30301975fd8c5cac4040c261c0852f57cfa8adbbad2ce78e77e4986957445f27",
- "data_path": "",
- "subset": "train",
- "supported_subsets": ["train", "test", "dev"],
- "dict": "TEDLIUM.150K.dic",
- },
- "release2": {
- "folder_in_archive": "TEDLIUM_release2",
- "url": "http://www.openslr.org/resources/19/TEDLIUM_release2.tar.gz",
- "checksum": "93281b5fcaaae5c88671c9d000b443cb3c7ea3499ad12010b3934ca41a7b9c58",
- "data_path": "",
- "subset": "train",
- "supported_subsets": ["train", "test", "dev"],
- "dict": "TEDLIUM.152k.dic",
- },
- "release3": {
- "folder_in_archive": "TEDLIUM_release-3",
- "url": "http://www.openslr.org/resources/51/TEDLIUM_release-3.tgz",
- "checksum": "ad1e454d14d1ad550bc2564c462d87c7a7ec83d4dc2b9210f22ab4973b9eccdb",
- "data_path": "data/",
- "subset": "train",
- "supported_subsets": ["train", "test", "dev"],
- "dict": "TEDLIUM.152k.dic",
- },
- }
class TEDLIUM(Dataset):
    """
    Create a Dataset for *Tedlium* [:footcite:`rousseau2012tedlium`]. It supports releases 1,2 and 3.

    Every line of every ``.stm`` transcript file found under ``<subset path>/stm`` is
    one sample. Indexing the dataset returns
    ``(waveform, sample_rate, transcript, talk_id, speaker_id, identifier)``.

    Args:
        root (str or Path): Path to the directory where the dataset is found or downloaded.
        release (str, optional): Release version.
            Allowed values are ``"release1"``, ``"release2"`` or ``"release3"``.
            (default: ``"release1"``).
        subset (str, optional): The subset of dataset to use. Valid options are ``"train"``, ``"dev"``,
            and ``"test"``. Defaults to ``"train"``.
        download (bool, optional):
            Whether to download the dataset if it is not found at root path. (default: ``False``).
        audio_ext (str, optional): extension for audio file (default: ``".sph"``)

    Raises:
        RuntimeError: If ``release`` or ``subset`` is not supported, or if the dataset
            directory does not exist and ``download`` is ``False``.
    """

    def __init__(
        self,
        root: Union[str, Path],
        release: str = "release1",
        subset: str = "train",
        download: bool = False,
        audio_ext: str = ".sph",
    ) -> None:
        self._ext_audio = audio_ext

        if release not in _RELEASE_CONFIGS:
            raise RuntimeError(
                "The release {} does not match any of the supported tedlium releases{} ".format(
                    release,
                    _RELEASE_CONFIGS.keys(),
                )
            )
        config = _RELEASE_CONFIGS[release]
        folder_in_archive = config["folder_in_archive"]
        url = config["url"]
        # A falsy subset (e.g. None or "") falls back to the release default.
        subset = subset if subset else config["subset"]
        if subset not in config["supported_subsets"]:
            raise RuntimeError(
                "The subset {} does not match any of the supported tedlium subsets{} ".format(
                    subset,
                    config["supported_subsets"],
                )
            )

        # Get string representation of 'root' in case Path object is passed
        root = os.fspath(root)
        archive = os.path.join(root, os.path.basename(url))

        if release == "release3":
            # Release 3 uses a different layout: the train subset lives directly in
            # the data directory, the other subsets under "legacy/<subset>".
            if subset == "train":
                self._path = os.path.join(root, folder_in_archive, config["data_path"])
            else:
                self._path = os.path.join(root, folder_in_archive, "legacy", subset)
        else:
            self._path = os.path.join(root, folder_in_archive, config["data_path"], subset)

        if download:
            if not os.path.isdir(self._path):
                if not os.path.isfile(archive):
                    # The download target directory must exist before the archive
                    # can be written into it.
                    os.makedirs(root, exist_ok=True)
                    download_url_to_file(url, archive, hash_prefix=config["checksum"])
                extract_archive(archive)
        elif not os.path.exists(self._path):
            raise RuntimeError(
                f"The path {self._path} doesn't exist. "
                "Please check the ``root`` path or set `download=True` to download it"
            )

        # Create list for all samples: one (file id, line number) pair per line of
        # each transcript file, visited in sorted file-name order.
        self._filelist = []
        stm_dir = os.path.join(self._path, "stm")
        for file in sorted(os.listdir(stm_dir)):
            if not file.endswith(".stm"):
                continue
            with open(os.path.join(stm_dir, file)) as f:
                num_lines = sum(1 for _ in f)
            # Strip only the trailing extension so the id always maps back to the
            # file it came from, even if ".stm" also occurs earlier in the name.
            fileid = file[: -len(".stm")]
            self._filelist.extend((fileid, line) for line in range(num_lines))

        # Create dict path for later read
        self._dict_path = os.path.join(root, folder_in_archive, config["dict"])
        self._phoneme_dict = None

    def _load_tedlium_item(self, fileid: str, line: int, path: str) -> Tuple[Tensor, int, str, str, str, str]:
        """Loads a TEDLIUM dataset sample given a file name and corresponding sentence name.

        Args:
            fileid (str): File id to identify both text and audio files corresponding to the sample
            line (int): Line identifier for the sample inside the text file
            path (str): Dataset root path

        Returns:
            (Tensor, int, str, str, str, str):
            ``(waveform, sample_rate, transcript, talk_id, speaker_id, identifier)``.
            The transcript is the remainder of the line after the sixth space and is
            returned unmodified, including any trailing newline.
        """
        transcript_path = os.path.join(path, "stm", fileid)
        with open(transcript_path + ".stm") as f:
            transcript = f.readlines()[line]
        # The second field of the line is not used.
        talk_id, _, speaker_id, start_time, end_time, identifier, transcript = transcript.split(" ", 6)

        wave_path = os.path.join(path, "sph", fileid)
        waveform, sample_rate = self._load_audio(wave_path + self._ext_audio, start_time=start_time, end_time=end_time)

        return (waveform, sample_rate, transcript, talk_id, speaker_id, identifier)

    def _load_audio(
        self,
        path: str,
        start_time: Union[str, float],
        end_time: Union[str, float],
        sample_rate: int = 16000,
    ) -> Tuple[Tensor, int]:
        """Default load function used in TEDLIUM dataset, you can overwrite this function to customize functionality
        and load individual sentences from a full ted audio talk file.

        Args:
            path (str): Path to audio file
            start_time (str or float): Time in seconds where the sample sentence starts
            end_time (str or float): Time in seconds where the sample sentence finishes
            sample_rate (int, optional): Sampling rate used to convert the times to frame offsets

        Returns:
            (Tensor, int): The result of ``torchaudio.load`` for the requested frame range
        """
        # Times arrive as text from the transcript file; convert seconds to frames.
        first_frame = int(float(start_time) * sample_rate)
        last_frame = int(float(end_time) * sample_rate)

        kwargs = {"frame_offset": first_frame, "num_frames": last_frame - first_frame}

        return torchaudio.load(path, **kwargs)

    def __getitem__(self, n: int) -> Tuple[Tensor, int, str, str, str, str]:
        """Load the n-th sample from the dataset.

        Args:
            n (int): The index of the sample to be loaded

        Returns:
            tuple: ``(waveform, sample_rate, transcript, talk_id, speaker_id, identifier)``
        """
        fileid, line = self._filelist[n]
        return self._load_tedlium_item(fileid, line, self._path)

    def __len__(self) -> int:
        """TEDLIUM dataset custom function overwriting len default behaviour.

        Returns:
            int: TEDLIUM dataset length
        """
        return len(self._filelist)

    @property
    def phoneme_dict(self):
        """dict[str, tuple[str]]: Phonemes. Mapping from word to tuple of phonemes.
        Note that some words have empty phonemes.

        The dictionary file is read on first access and cached; each access returns
        a shallow copy of the cached mapping. Blank lines in the file are ignored.
        """
        # Read phoneme dictionary. Compare against None so that an empty
        # dictionary file is also cached instead of being re-read every time.
        if self._phoneme_dict is None:
            phonemes = {}
            with open(self._dict_path, "r", encoding="utf-8") as f:
                for line in f:
                    content = line.strip().split()
                    if not content:
                        # Blank line: there is no word to index.
                        continue
                    phonemes[content[0]] = tuple(content[1:])  # content[1:] can be empty list
            self._phoneme_dict = phonemes
        return self._phoneme_dict.copy()
|