kinetics.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328
  1. import csv
  2. import os
  3. import time
  4. import urllib
  5. import warnings
  6. from functools import partial
  7. from multiprocessing import Pool
  8. from os import path
  9. from typing import Any, Callable, Dict, Optional, Tuple
  10. from torch import Tensor
  11. from .folder import find_classes, make_dataset
  12. from .utils import download_and_extract_archive, download_url, verify_str_arg, check_integrity
  13. from .video_utils import VideoClips
  14. from .vision import VisionDataset
  def _dl_wrap(tarpath: str, videopath: str, line: str) -> None:
      """Download one archive and extract it; argument order suits ``functools.partial``.

      The two directory arguments come first so that a caller can bind them with
      ``partial(_dl_wrap, tarpath, videopath)`` and map the resulting one-argument
      callable over a list of archive URLs (as the multi-worker download path does).

      Args:
          tarpath: Directory handed to ``download_and_extract_archive`` as its second argument.
          videopath: Directory handed to ``download_and_extract_archive`` as its third argument.
          line: One entry of the archive URL list; handed over as the first argument.
      """
      archive_url = line
      download_and_extract_archive(archive_url, tarpath, videopath)
  class Kinetics(VisionDataset):
      """`Generic Kinetics <https://www.deepmind.com/open-source/kinetics>`_
      dataset.

      Kinetics-400/600/700 are action recognition video datasets.
      This dataset considers every video as a collection of video clips of fixed size, specified
      by ``frames_per_clip``, where the step in frames between each clip is given by
      ``step_between_clips``.

      To give an example, for 2 videos with 10 and 15 frames respectively, if ``frames_per_clip=5``
      and ``step_between_clips=5``, the dataset size will be (2 + 3) = 5, where the first two
      elements will come from video 1, and the next three elements from video 2.
      Note that we drop clips which do not have exactly ``frames_per_clip`` elements, so not all
      frames in a video might be present.

      Args:
          root (string): Root directory of the Kinetics Dataset.
              Directory should be structured as follows:
              .. code::

                  root/
                  ├── split
                  │   ├── class1
                  │   │   ├── clip1.mp4
                  │   │   ├── clip2.mp4
                  │   │   ├── clip3.mp4
                  │   │   ├── ...
                  │   ├── class2
                  │   │   ├── clipx.mp4
                  │   │   └── ...

              Note: split is appended automatically using the split argument.
          frames_per_clip (int): number of frames in a clip
          num_classes (str): select between Kinetics-400 (``"400"``, default), Kinetics-600 (``"600"``),
              and Kinetics-700 (``"700"``)
          split (str): split of the dataset to consider; supports ``"train"`` (default) ``"val"`` ``"test"``
          frame_rate (float): If omitted, interpolate different frame rate for each clip.
          step_between_clips (int): number of frames between each clip
          transform (callable, optional): A function/transform that takes in a TxHxWxC video
              and returns a transformed version.
          extensions (tuple of str): file extensions handed to ``make_dataset`` when collecting the
              video files; defaults to ``("avi", "mp4")``.
          download (bool): Download the official version of the dataset to root folder.
          num_workers (int): Use multiple workers for VideoClips creation
          num_download_workers (int): Use multiprocessing in order to speed up download.
          output_format (str, optional): The format of the output video tensors (before transforms).
              Can be either "THWC" or "TCHW" (default).
              Note that in most other utils and datasets, the default is actually "THWC".

      Returns:
          tuple: A 3-tuple with the following entries:

              - video (Tensor[T, C, H, W] or Tensor[T, H, W, C]): the `T` video frames in torch.uint8 tensor
              - audio(Tensor[K, L]): the audio frames, where `K` is the number of channels
                and `L` is the number of points in torch.float tensor
              - label (int): class of the video clip

      Raises:
          RuntimeError: If ``download is True`` and the video archives are already extracted.
      """

      # Per-variant URL templates of the text file listing the video archive URLs of a split.
      _TAR_URLS = {
          "400": "https://s3.amazonaws.com/kinetics/400/{split}/k400_{split}_path.txt",
          "600": "https://s3.amazonaws.com/kinetics/600/{split}/k600_{split}_path.txt",
          "700": "https://s3.amazonaws.com/kinetics/700_2020/{split}/k700_2020_{split}_path.txt",
      }
      # Per-variant URL templates of the annotation CSV of a split.
      _ANNOTATION_URLS = {
          "400": "https://s3.amazonaws.com/kinetics/400/annotations/{split}.csv",
          "600": "https://s3.amazonaws.com/kinetics/600/annotations/{split}.csv",
          "700": "https://s3.amazonaws.com/kinetics/700_2020/annotations/{split}.csv",
      }

      def __init__(
          self,
          root: str,
          frames_per_clip: int,
          num_classes: str = "400",
          split: str = "train",
          frame_rate: Optional[int] = None,
          step_between_clips: int = 1,
          transform: Optional[Callable] = None,
          extensions: Tuple[str, ...] = ("avi", "mp4"),
          download: bool = False,
          num_download_workers: int = 1,
          num_workers: int = 1,
          _precomputed_metadata: Optional[Dict[str, Any]] = None,
          _video_width: int = 0,
          _video_height: int = 0,
          _video_min_dimension: int = 0,
          _audio_samples: int = 0,
          _audio_channels: int = 0,
          _legacy: bool = False,
          output_format: str = "TCHW",
      ) -> None:
          """Validate the arguments, optionally download the data, and index the clips.

          The underscore-prefixed parameters are forwarded unchanged to ``VideoClips``,
          except ``_legacy``, which switches to the flat ``root/class/clip`` layout,
          forces ``output_format="THWC"`` and forbids ``download``.

          Raises:
              ValueError: If ``_legacy`` and ``download`` are both set.
          """
          # TODO: support test
          self.num_classes = verify_str_arg(num_classes, arg="num_classes", valid_values=["400", "600", "700"])
          self.extensions = extensions
          self.num_download_workers = num_download_workers

          self.root = root
          self._legacy = _legacy

          if _legacy:
              print("Using legacy structure")
              self.split_folder = root
              self.split = "unknown"
              output_format = "THWC"
              if download:
                  raise ValueError("Cannot download the videos using legacy_structure.")
          else:
              # Validate before using ``split`` to build a path, so a bad value fails
              # with the argument error rather than inside ``path.join``.
              self.split = verify_str_arg(split, arg="split", valid_values=["train", "val", "test"])
              self.split_folder = path.join(root, split)

          if download:
              self.download_and_process_videos()

          super().__init__(self.root)

          self.classes, class_to_idx = find_classes(self.split_folder)
          self.samples = make_dataset(self.split_folder, class_to_idx, extensions, is_valid_file=None)
          video_list = [sample[0] for sample in self.samples]
          self.video_clips = VideoClips(
              video_list,
              frames_per_clip,
              step_between_clips,
              frame_rate,
              _precomputed_metadata,
              num_workers=num_workers,
              _video_width=_video_width,
              _video_height=_video_height,
              _video_min_dimension=_video_min_dimension,
              _audio_samples=_audio_samples,
              _audio_channels=_audio_channels,
              output_format=output_format,
          )
          self.transform = transform

      def download_and_process_videos(self) -> None:
          """Downloads all the videos to the _root_ folder in the expected format."""
          tic = time.time()
          self._download_videos()
          toc = time.time()
          print("Elapsed time for downloading in mins ", (toc - tic) / 60)
          self._make_ds_structure()
          toc2 = time.time()
          print("Elapsed time for processing in mins ", (toc2 - toc) / 60)
          print("Elapsed time overall in mins ", (toc2 - tic) / 60)

      def _download_videos(self) -> None:
          """download tarballs containing the video to "tars" folder and extract them into the _split_ folder where
          split is one of the official dataset splits.

          Raises:
              RuntimeError: if download folder exists, break to prevent downloading entire dataset again.
          """
          # A bare ``import urllib`` does not load the ``parse`` submodule; import it
          # explicitly so this method does not rely on some other module having done so.
          from urllib.parse import quote

          if path.exists(self.split_folder):
              raise RuntimeError(
                  f"The directory {self.split_folder} already exists. "
                  f"If you want to re-download or re-extract the images, delete the directory."
              )
          tar_path = path.join(self.root, "tars")
          file_list_path = path.join(self.root, "files")

          # The split's "path" file lists one archive URL per line.
          split_url = self._TAR_URLS[self.num_classes].format(split=self.split)
          split_url_filepath = path.join(file_list_path, path.basename(split_url))
          if not check_integrity(split_url_filepath):
              download_url(split_url, file_list_path)
          with open(split_url_filepath) as file:
              list_video_urls = [quote(line, safe="/,:") for line in file.read().splitlines()]

          if self.num_download_workers == 1:
              for line in list_video_urls:
                  download_and_extract_archive(line, tar_path, self.split_folder)
          else:
              part = partial(_dl_wrap, tar_path, self.split_folder)
              # ``map`` blocks until every download finished; the ``with`` block then
              # tears the worker processes down instead of leaking them.
              with Pool(self.num_download_workers) as poolproc:
                  poolproc.map(part, list_video_urls)

      def _make_ds_structure(self) -> None:
          """move videos from
          split_folder/
              ├── clip1.avi
              ├── clip2.avi

          to the correct format as described below:
          split_folder/
              ├── class1
              │   ├── clip1.avi

          """
          annotation_path = path.join(self.root, "annotations")
          annotations = path.join(annotation_path, f"{self.split}.csv")
          if not check_integrity(annotations):
              download_url(self._ANNOTATION_URLS[self.num_classes].format(split=self.split), annotation_path)

          file_fmtstr = "{ytid}_{start:06}_{end:06}.mp4"
          # Class folder names: spaces become underscores; quotes and parentheses are dropped.
          label_table = str.maketrans(" ", "_", "'()")
          # ``newline=""`` lets the csv module do its own newline handling.
          with open(annotations, newline="") as csvfile:
              reader = csv.DictReader(csvfile)
              for row in reader:
                  f = file_fmtstr.format(
                      ytid=row["youtube_id"],
                      start=int(row["time_start"]),
                      end=int(row["time_end"]),
                  )
                  label = row["label"].translate(label_table)
                  os.makedirs(path.join(self.split_folder, label), exist_ok=True)
                  downloaded_file = path.join(self.split_folder, f)
                  # Annotated clips that are not on disk are skipped silently.
                  if path.isfile(downloaded_file):
                      os.replace(
                          downloaded_file,
                          path.join(self.split_folder, label, f),
                      )

      @property
      def metadata(self) -> Dict[str, Any]:
          """Metadata of the underlying ``VideoClips`` object."""
          return self.video_clips.metadata

      def __len__(self) -> int:
          """Number of clips reported by the underlying ``VideoClips`` object."""
          return self.video_clips.num_clips()

      def __getitem__(self, idx: int) -> Tuple[Tensor, Tensor, int]:
          """Return ``(video, audio, label)`` for clip ``idx``; ``transform`` is applied to the video only."""
          video, audio, _, video_idx = self.video_clips.get_clip(idx)
          label = self.samples[video_idx][1]

          if self.transform is not None:
              video = self.transform(video)

          return video, audio, label
  class Kinetics400(Kinetics):
      """
      `Kinetics-400 <https://deepmind.com/research/open-source/open-source-datasets/kinetics/>`_
      dataset.

      .. warning::
          This class was deprecated in ``0.12`` and will be removed in ``0.14``. Please use
          ``Kinetics(..., num_classes='400')`` instead.

      Kinetics-400 is an action recognition video dataset.
      This dataset considers every video as a collection of video clips of fixed size, specified
      by ``frames_per_clip``, where the step in frames between each clip is given by
      ``step_between_clips``.

      To give an example, for 2 videos with 10 and 15 frames respectively, if ``frames_per_clip=5``
      and ``step_between_clips=5``, the dataset size will be (2 + 3) = 5, where the first two
      elements will come from video 1, and the next three elements from video 2.
      Note that we drop clips which do not have exactly ``frames_per_clip`` elements, so not all
      frames in a video might be present.

      Internally, it uses a VideoClips object to handle clip creation.

      Args:
          root (string): Root directory of the Kinetics-400 Dataset. Should be structured as follows:
              .. code::

                  root/
                  ├── class1
                  │   ├── clip1.avi
                  │   ├── clip2.avi
                  │   ├── clip3.mp4
                  │   └── ...
                  └── class2
                      ├── clipx.avi
                      └── ...

          frames_per_clip (int): number of frames in a clip
          step_between_clips (int): number of frames between each clip
          transform (callable, optional): A function/transform that takes in a TxHxWxC video
              and returns a transformed version.
          **kwargs: Any further keyword arguments are forwarded unchanged to ``Kinetics.__init__``.

      Returns:
          tuple: A 3-tuple with the following entries:

              - video (Tensor[T, H, W, C]): the `T` video frames
              - audio(Tensor[K, L]): the audio frames, where `K` is the number of channels
                and `L` is the number of points
              - label (int): class of the video clip

      Raises:
          RuntimeError: If any of ``num_classes``, ``split``, ``download`` or
              ``num_download_workers`` is given a value other than ``None``.
      """

      def __init__(
          self,
          root: str,
          frames_per_clip: int,
          num_classes: Any = None,
          split: Any = None,
          download: Any = None,
          num_download_workers: Any = None,
          **kwargs: Any,
      ) -> None:
          # Adjacent literals are concatenated verbatim, so every sentence except the
          # last one carries its own trailing space.
          warnings.warn(
              "The Kinetics400 class is deprecated since 0.12 and will be removed in 0.14. "
              "Please use Kinetics(..., num_classes='400') instead. "
              "Note that Kinetics(..., num_classes='400') returns video in a Tensor[T, C, H, W] format."
          )
          # These four parameters exist only so that passing them can be rejected
          # with an explicit message; ``None`` means "not supplied".
          if any(value is not None for value in (num_classes, split, download, num_download_workers)):
              raise RuntimeError(
                  "Usage of 'num_classes', 'split', 'download', or 'num_download_workers' is not supported in "
                  "Kinetics400. Please use Kinetics instead."
              )

          super().__init__(
              root=root,
              frames_per_clip=frames_per_clip,
              _legacy=True,
              **kwargs,
          )