Creative Commons Attribution 4.0 International license
Cycle-accurate GPGPU simulators like GPGPU-Sim provide invaluable insights for hardware architecture research but suffer from extremely long runtimes, hindering research productivity. This paper addresses this critical bottleneck by proposing a strategy to accelerate GPGPU-Sim. We first perform a holistic profiling analysis across diverse GPGPU benchmarks to identify the primary performance bottleneck, pinpointing the SIMT-Core cluster execution within the CORE-clock cycle. Based on this, we implement a parallelization scheme that strategically targets this hotspot, utilizing a thread pool to manage concurrent execution of SIMT-Core clusters. Our approach prioritizes minimal modifications to the existing GPGPU-Sim codebase to ensure long-term maintainability. Evaluation of a simulated NVIDIA H100 model demonstrates an average simulation wall-time speedup of 3.58× with 8 worker threads and a maximum speedup of 4.38×, while incurring a maximum cycle count error of 3.22%, with some benchmarks exhibiting no error at all.
% Workshop paper published in the OASIcs series, volume 141 (PARMA-DITAM 2026).
% Only acronyms are brace-protected in the title so that bibliography styles
% can still apply their own casing; accents use the brace-wrapped form so that
% classic BibTeX treats each accented letter as a single character when sorting.
@inproceedings{sachs_et_al:OASIcs.PARMA-DITAM.2026.6,
  author    = {Sachs, Jakob and L{\"u}hnen, Tim and Lal, Sohan},
  title     = {Accelerating {GPGPU} Simulation by Strategically Parallelizing the Compute Bottleneck},
  booktitle = {17th Workshop on Parallel Programming and Run-Time Management Techniques for Many-Core Architectures and 15th Workshop on Design Tools and Architectures for Multicore Embedded Computing Platforms ({PARMA-DITAM} 2026)},
  pages     = {6:1--6:13},
  series    = {Open Access Series in Informatics ({OASIcs})},
  isbn      = {978-3-95977-416-1},
  issn      = {2190-6807},
  year      = {2026},
  volume    = {141},
  editor    = {Baroffio, Davide and Busia, Paola and Denisov, Lev and Shukla, Nitin},
  publisher = {Schloss Dagstuhl -- Leibniz-Zentrum f{\"u}r Informatik},
  address   = {Dagstuhl, Germany},
  url       = {https://drops.dagstuhl.de/entities/document/10.4230/OASIcs.PARMA-DITAM.2026.6},
  urn       = {urn:nbn:de:0030-drops-256736},
  doi       = {10.4230/OASIcs.PARMA-DITAM.2026.6},
  annote    = {Keywords: GPGPU, CUDA, Simulation, Computer Architecture, GPGPU-Sim, Parallel Simulation, Cycle-Accurate Simulation, Thread Pool}
}