High-performance heterogeneous embedded platforms allow offloading of parallel workloads to an integrated accelerator, such as General-Purpose Graphics Processing Units (GP-GPUs). A time-predictable characterization of task submission is a must in real-time applications. We provide a profiler of the time spent by the CPU for submitting stereotypical GP-GPU workload shaped as a Deep Neural Network of parameterized complexity. The submission is performed using the latest APIs available: NVIDIA CUDA, including its various submission techniques, and Vulkan. Complete automation for the test on Jetson Xavier is also provided by scripts that install software dependencies, run the experiments, and collect results in a PDF report.
@article{cavicchioli_et_al:DARTS.5.1.4,
  author    = {Cavicchioli, Roberto and Capodieci, Nicola and Solieri, Marco and Bertogna, Marko},
  title     = {{API} Comparison of {CPU-To-GPU} Command Offloading Latency on Embedded Platforms},
  pages     = {4:1--4:3},
  journal   = {Dagstuhl Artifacts Series},
  issn      = {2509-8195},
  year      = {2019},
  volume    = {5},
  number    = {1},
  editor    = {Cavicchioli, Roberto and Capodieci, Nicola and Solieri, Marco and Bertogna, Marko},
  publisher = {Schloss Dagstuhl -- Leibniz-Zentrum f{\"u}r Informatik},
  address   = {Dagstuhl, Germany},
  url       = {https://drops.dagstuhl.de/entities/document/10.4230/DARTS.5.1.4},
  urn       = {urn:nbn:de:0030-drops-107322},
  doi       = {10.4230/DARTS.5.1.4},
  annote    = {Keywords: GPU, Applications, Heterogeneous systems},
}
3978b2398eab0687e51009e681c0ada9
(Get MD5 Sum)
Feedback for Dagstuhl Publishing