| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133 |
- import os
- from typing import Tuple
- import torchaudio
- from torch import Tensor
- from torch.hub import download_url_to_file
- from torch.utils.data import Dataset
- from torchaudio.datasets.utils import extract_archive
# Default download location of the VCTK 0.92 corpus archive.
URL = "https://datashare.is.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip"

# Hash prefix passed to ``download_url_to_file`` for each known archive URL.
# Keyed by ``URL`` itself so the key cannot drift from the default URL.
_CHECKSUMS = {
    URL: "f96258be9fdc2cbff6559541aae7ea4f59df3fcaf5cf963aae5ca647357e359c",
}

# One dataset item: (waveform, sample_rate, transcript, speaker_id, utterance_id).
SampleType = Tuple[Tensor, int, str, str, str]
class VCTK_092(Dataset):
    """Create *VCTK 0.92* [:footcite:`yamagishi2019vctk`] Dataset

    Args:
        root (str): Root directory where the dataset's top level directory is found.
        mic_id (str, optional): Microphone ID. Either ``"mic1"`` or ``"mic2"``. (default: ``"mic2"``)
        download (bool, optional):
            Whether to download the dataset if it is not found at root path. (default: ``False``).
        url (str, optional): The URL to download the dataset from.
            (default: ``"https://datashare.is.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip"``)
        audio_ext (str, optional): Custom audio extension if dataset is converted to non-default audio format.
            (default: ``".flac"``)

    Raises:
        RuntimeError: If ``mic_id`` is neither ``"mic1"`` nor ``"mic2"``, or if the
            ``VCTK-Corpus-0.92`` directory does not exist under ``root`` after the
            optional download step.

    Note:
        * All the speeches from speaker ``p315`` will be skipped due to the lack of the corresponding text files.
        * All the speeches from ``p280`` will be skipped for ``mic_id="mic2"`` due to the lack of the audio files.
        * Some of the speeches from speaker ``p362`` will be skipped due to the lack of the audio files.
        * See Also: https://datashare.is.ed.ac.uk/handle/10283/3443
    """

    def __init__(
        self,
        root: str,
        mic_id: str = "mic2",
        download: bool = False,
        url: str = URL,
        audio_ext: str = ".flac",
    ):
        if mic_id not in ["mic1", "mic2"]:
            raise RuntimeError(f'`mic_id` has to be either "mic1" or "mic2". Found: {mic_id}')

        archive = os.path.join(root, "VCTK-Corpus-0.92.zip")

        self._path = os.path.join(root, "VCTK-Corpus-0.92")
        self._txt_dir = os.path.join(self._path, "txt")
        self._audio_dir = os.path.join(self._path, "wav48_silence_trimmed")
        self._mic_id = mic_id
        self._audio_ext = audio_ext

        if download:
            if not os.path.isdir(self._path):
                if not os.path.isfile(archive):
                    # The archive is written into ``root``; make sure it exists first.
                    os.makedirs(root, exist_ok=True)
                    checksum = _CHECKSUMS.get(url, None)
                    download_url_to_file(url, archive, hash_prefix=checksum)
                extract_archive(archive, self._path)

        if not os.path.isdir(self._path):
            raise RuntimeError("Dataset not found. Please use `download=True` to download it.")

        # Speaker IDs are the sub-directory names of the text folder. Plain files
        # that may sit next to them (e.g. OS metadata files) are ignored, since
        # listing them as directories below would fail.
        self._speaker_ids = sorted(
            entry for entry in os.listdir(self._txt_dir) if os.path.isdir(os.path.join(self._txt_dir, entry))
        )
        self._sample_ids = []

        # The audio folder is traversed by following the text folder. For some
        # utterances the transcript exists but the audio for ``mic1`` or ``mic2``
        # is missing, so for the affected speaker the audio file is checked for
        # existence before its ID is added to ``_sample_ids``.
        # Once the IDs are in memory, samples can be looked up by index quickly.
        for speaker_id in self._speaker_ids:
            if speaker_id == "p280" and mic_id == "mic2":
                continue
            utterance_dir = os.path.join(self._txt_dir, speaker_id)
            for utterance_file in sorted(f for f in os.listdir(utterance_dir) if f.endswith(".txt")):
                utterance_id = os.path.splitext(utterance_file)[0]
                audio_path_mic = os.path.join(
                    self._audio_dir,
                    speaker_id,
                    f"{utterance_id}_{mic_id}{self._audio_ext}",
                )
                if speaker_id == "p362" and not os.path.isfile(audio_path_mic):
                    continue
                # File stems look like ``<speaker>_<utterance>``; store both parts.
                self._sample_ids.append(utterance_id.split("_"))

    def _load_text(self, file_path) -> str:
        """Return the first line of the transcript file at ``file_path``.

        The line is returned as read, including its trailing newline if present.
        An empty file yields an empty string.
        """
        # Explicit encoding keeps the result independent of the platform locale.
        with open(file_path, encoding="utf-8") as handle:
            return handle.readline()

    def _load_audio(self, file_path) -> Tuple[Tensor, int]:
        """Load the audio file at ``file_path`` with ``torchaudio.load``."""
        return torchaudio.load(file_path)

    def _load_sample(self, speaker_id: str, utterance_id: str, mic_id: str) -> SampleType:
        """Load the transcript and audio for one utterance.

        Args:
            speaker_id (str): Speaker directory name, e.g. the part before ``_`` in the file stem.
            utterance_id (str): Utterance part of the file stem.
            mic_id (str): Microphone suffix used in the audio file name.

        Returns:
            (Tensor, int, str, str, str):
            ``(waveform, sample_rate, transcript, speaker_id, utterance_id)``
        """
        transcript_path = os.path.join(self._txt_dir, speaker_id, f"{speaker_id}_{utterance_id}.txt")
        audio_path = os.path.join(
            self._audio_dir,
            speaker_id,
            f"{speaker_id}_{utterance_id}_{mic_id}{self._audio_ext}",
        )

        # Reading text
        transcript = self._load_text(transcript_path)

        # Reading audio (extension is configurable via ``audio_ext``)
        waveform, sample_rate = self._load_audio(audio_path)

        return (waveform, sample_rate, transcript, speaker_id, utterance_id)

    def __getitem__(self, n: int) -> SampleType:
        """Load the n-th sample from the dataset.

        Args:
            n (int): The index of the sample to be loaded

        Returns:
            (Tensor, int, str, str, str):
            ``(waveform, sample_rate, transcript, speaker_id, utterance_id)``
        """
        speaker_id, utterance_id = self._sample_ids[n]
        return self._load_sample(speaker_id, utterance_id, self._mic_id)

    def __len__(self) -> int:
        """Return the number of samples collected at construction time."""
        return len(self._sample_ids)
|