| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091 |
- import os
- from pathlib import Path
- from typing import List, Tuple, Union
- from torch import Tensor
- from torch.hub import download_url_to_file
- from torch.utils.data import Dataset
- from torchaudio.datasets.librispeech import load_librispeech_item
- from torchaudio.datasets.utils import extract_archive
# Base name shared by the extracted dataset directory (``{root}/{_ARCHIVE_NAME}``)
# and the downloaded archive file (``{root}/{_ARCHIVE_NAME}.tgz``).
_ARCHIVE_NAME = "librispeech_finetuning"
# Location the archive is fetched from when ``download=True`` and it is not already on disk.
_URL = "https://dl.fbaipublicfiles.com/librilight/data/librispeech_finetuning.tgz"
# Passed as ``hash_prefix`` to ``download_url_to_file`` when fetching the archive.
_CHECKSUM = "5d1efdc777b548194d7e09ba89126e2188026df9fd57aa57eb14408d2b2342af"
def _get_fileids_paths(path, subset, _ext_audio) -> List[Tuple[str, str]]:
    """Collect ``(directory, file_id)`` pairs for every audio file of a subset.

    For each matching audio file the returned directory is the file's own
    directory joined with ``"..", ".."`` (i.e. with the two innermost directory
    levels stepped over), and ``file_id`` is the file name without its extension.

    The directory layout that is searched looks like:
    {root}/{_ARCHIVE_NAME}/1h/[0-5]/[clean, other] or
    {root}/{_ARCHIVE_NAME}/9h/[clean, other]

    Args:
        path: Directory containing the ``1h`` and ``9h`` folders.
        subset: One of ``"10min"``, ``"1h"`` or ``"10h"``.
        _ext_audio: Audio file extension appended to the glob patterns.

    Returns:
        The pairs, ordered by the concatenation of directory and file id.

    Raises:
        ValueError: If ``subset`` is not one of the supported values.
    """
    # Decide which glob patterns (relative to ``path``) make up the subset.
    if subset == "10min":
        patterns = ["1h/0/*/*/*/*"]
    elif subset == "1h":
        patterns = ["1h/*/*/*/*/*"]
    elif subset == "10h":
        # The 10h subset is the 1h data plus everything under ``9h``.
        patterns = ["1h/*/*/*/*/*", "9h/*/*/*/*"]
    else:
        raise ValueError(f"Unsupported subset value. Found {subset}.")

    base_dir = Path(path)
    entries = []
    for pattern in patterns:
        for audio_file in base_dir.glob(pattern + _ext_audio):
            parent_dir = os.path.join(os.path.dirname(audio_file), "..", "..")
            entries.append((parent_dir, str(audio_file.stem)))

    # Stable sort keyed on directory + file id gives a deterministic ordering.
    entries.sort(key=lambda entry: entry[0] + entry[1])
    return entries
class LibriLightLimited(Dataset):
    """Dataset wrapper around LibriLightLimited, the supervised subset of the
    LibriLight dataset.

    Args:
        root (str or Path): Directory in which the dataset lives, or into which it is downloaded.
        subset (str, optional): Which subset to load. Options: [``10min``, ``1h``, ``10h``]
            (Default: ``10min``).
        download (bool, optional):
            If ``True``, fetch and extract the archive when the dataset directory is
            missing under ``root``. (default: ``False``).
    """

    _ext_txt = ".trans.txt"
    _ext_audio = ".flac"

    def __init__(
        self,
        root: Union[str, Path],
        subset: str = "10min",
        download: bool = False,
    ) -> None:
        assert subset in ("10min", "1h", "10h"), "`subset` must be one of ['10min', '1h', '10h']"

        root_dir = os.fspath(root)
        self._path = os.path.join(root_dir, _ARCHIVE_NAME)

        dataset_missing = not os.path.isdir(self._path)
        if dataset_missing and not download:
            raise RuntimeError("Dataset not found. Please use `download=True` to download")
        if dataset_missing:
            tarball = os.path.join(root_dir, f"{_ARCHIVE_NAME}.tgz")
            # Reuse an archive that is already on disk; only fetch when it is absent.
            if not os.path.isfile(tarball):
                download_url_to_file(_URL, tarball, hash_prefix=_CHECKSUM)
            extract_archive(tarball)

        self._fileids_paths = _get_fileids_paths(self._path, subset, self._ext_audio)

    def __getitem__(self, n: int) -> Tuple[Tensor, int, str, int, int, int]:
        """Fetch the sample stored at position ``n``.

        Args:
            n (int): Index of the sample to load.

        Returns:
            (Tensor, int, str, int, int, int):
            ``(waveform, sample_rate, transcript, speaker_id, chapter_id, utterance_id)``
        """
        location, utterance = self._fileids_paths[n]
        return load_librispeech_item(utterance, location, self._ext_audio, self._ext_txt)

    def __len__(self) -> int:
        """Return the number of samples in the selected subset."""
        return len(self._fileids_paths)
|