Dionysios Kefallinos, Georgios Alexandris, Alexis Maras, Panagiotis Chaidos, Manil Dev Gomony, Henk Corporaal, Dimitrios Soudris, Sotirios Xydis
Licensed under the Creative Commons Attribution 4.0 International license (CC BY 4.0)
Since the emergence of transformer-based models, the computational demands of Large Language Model (LLM) inference have grown exponentially, driven by compounding parameter sizes, structural complexity, and the use of non-linear functions. This trend creates a pressing need to deploy LLMs on low-power edge devices and DNN accelerators in order to fuel next-generation agentic AI systems. Coarse-Grained Reconfigurable Architectures (CGRAs) have proven to be a compelling paradigm for edge acceleration, combining the programmability of general-purpose platforms with the high performance and energy efficiency associated with ASICs. In this work, we introduce an end-to-end performance modeling and mapping framework for LLM inference on heterogeneous CGRAs. Our methodology enables rapid exploration of the micro-architectural design space, i.e., the number of processing elements, vector sizes, and memory configurations, through an accurate, explainable, and analytical CGRA performance modeling methodology with an average cycle error of 0.9%. Architecturally, we build upon R-Blocks, a heterogeneous CGRA platform, and extend it with support for floating-point arithmetic as well as a full-stack compilation and mapping flow for both full-precision (FP32) and quantized (INT8) Llama2 models. Evaluated on a 22nm technology node, the proposed methodology achieves superior peak performance per Watt compared to related works such as REVAMP and CFEACT (1.8× and 2.8× higher, respectively).
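To make the design-space exploration idea concrete, the minimal sketch below shows a first-order, roofline-style analytical cycle estimate for a matrix-vector kernel (the dominant operation in LLM inference), parameterized by PE count, vector width, and memory bandwidth. The parameter names, the max()-based formulation, and the swept values are illustrative assumptions for this sketch, not the performance model published in the paper.

```python
# Illustrative sketch only: a first-order analytical cycle model for a
# matrix-vector product on a vectorized CGRA. Parameters and the
# roofline-style max() formulation are assumptions, not the paper's model.

from dataclasses import dataclass
from math import ceil


@dataclass
class CgraConfig:
    num_pes: int              # number of processing elements performing MACs
    vector_width: int         # SIMD lanes per processing element
    mem_words_per_cycle: int  # words the local memory can supply per cycle


def gemv_cycles(rows: int, cols: int, cfg: CgraConfig) -> int:
    """Lower-bound cycle estimate for a (rows x cols) matrix-vector product."""
    macs = rows * cols
    # Compute-bound term: MACs spread across all lanes of all PEs.
    compute = ceil(macs / (cfg.num_pes * cfg.vector_width))
    # Memory-bound term: every matrix element is streamed once from memory.
    memory = ceil(macs / cfg.mem_words_per_cycle)
    return max(compute, memory)


if __name__ == "__main__":
    # Sweep a toy design space for a hypothetical 4096 x 4096 projection layer.
    for pes in (4, 8, 16):
        for vw in (4, 8):
            cfg = CgraConfig(num_pes=pes, vector_width=vw, mem_words_per_cycle=16)
            print(f"PEs={pes:2d} lanes={vw} -> {gemv_cycles(4096, 4096, cfg)} cycles")
```

Such a model makes each estimate explainable (each term maps to a named hardware resource) and cheap enough to sweep over many PE, vector-size, and memory configurations before committing to a design point.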
@InProceedings{kefallinos_et_al:OASIcs.PARMA-DITAM.2026.8,
author = {Kefallinos, Dionysios and Alexandris, Georgios and Maras, Alexis and Chaidos, Panagiotis and Gomony, Manil Dev and Corporaal, Henk and Soudris, Dimitrios and Xydis, Sotirios},
title = {{Performance Modeling \& Mapping of LLM Inference on Heterogeneous Vectorized CGRAs}},
booktitle = {17th Workshop on Parallel Programming and Run-Time Management Techniques for Many-Core Architectures and 15th Workshop on Design Tools and Architectures for Multicore Embedded Computing Platforms (PARMA-DITAM 2026)},
pages = {8:1--8:14},
series = {Open Access Series in Informatics (OASIcs)},
ISBN = {978-3-95977-416-1},
ISSN = {2190-6807},
year = {2026},
volume = {141},
editor = {Baroffio, Davide and Busia, Paola and Denisov, Lev and Shukla, Nitin},
publisher = {Schloss Dagstuhl -- Leibniz-Zentrum f{\"u}r Informatik},
address = {Dagstuhl, Germany},
URL = {https://drops.dagstuhl.de/entities/document/10.4230/OASIcs.PARMA-DITAM.2026.8},
URN = {urn:nbn:de:0030-drops-256752},
doi = {10.4230/OASIcs.PARMA-DITAM.2026.8},
annote = {Keywords: Edge AI, LLM, CGRA, Heterogeneous Architectures, Performance Modeling, Hardware Acceleration, Low Power Computing}
}