% Conference paper in a Springer LNCS proceedings volume. Names are stored as
% "Last, First"; case-sensitive words in the title are brace-protected.
@inproceedings{ef829dbc7ee044f1aad51cb15941e7a3,
  author    = {Martineau, Matt and Atkinson, Patrick and McIntosh-Smith, Simon},
  title     = {Benchmarking the {NVIDIA} {V100} {GPU} and {Tensor} Cores},
  booktitle = {{Euro-Par} 2018: Parallel Processing Workshops},
  editor    = {Mencagli, Gabriele and Heras, Dora B.},
  series    = {Lecture Notes in Computer Science},
  volume    = {11339},
  publisher = {Springer},
  address   = {Cham},
  year      = {2019},
  month     = jan,
  day       = {1},
  pages     = {444--455},
  doi       = {10.1007/978-3-030-10549-5_35},
  isbn      = {978-3-030-10548-8},
  language  = {English},
  note      = {24th International Conference on Parallel and Distributed Computing, Euro-Par 2018 ; Conference date: 27-08-2018 Through 28-08-2018},
  abstract  = {The V100 GPU is the newest server-grade GPU produced by NVIDIA and introduces a number of new hardware and API features. This paper details the results of benchmarking the V100 GPU and demonstrates that it is a significant generational improvement, increasing memory bandwidth, cache bandwidth, and reducing latency. A major new addition is the Tensor core units, which have been marketed as deep learning acceleration features that enable the computation of a $4 \times 4 \times 4$ half precision matrix-multiply-accumulate operation in a single clock cycle. This paper confirms that the Tensor cores offer considerable performance gains for half precision general matrix multiplication; however, programming them requires fine control of the memory hierarchy that is typically unnecessary for other applications.},
}