/*------------------------------------------------------------------------------------------------*/
/* Copyright (C) by the DBCSR developers group - All rights reserved                              */
/* This file is part of the DBCSR library.                                                        */
/*                                                                                                */
/* For information on the license, see the LICENSE file.                                          */
/* For further information please visit https://dbcsr.cp2k.org                                    */
/* SPDX-License-Identifier: BSD-3-Clause                                                          */
/*------------------------------------------------------------------------------------------------*/
#if defined(__OPENCL)
#  include "opencl_libsmm.h"
/* Header opencl_kernels.h is generated by the build system using acc_opencl.sh */
#  include "opencl_kernels.h"
#  include "../../acc_bench.h"
#  include <ctype.h>

/* The generated header (opencl_kernels.h) must provide the OpenCL source of both kernel kinds. */
#  if !defined(OPENCL_KERNELS_SOURCE_TRANSPOSE)
#    error "OpenCL transpose-kernel code not found!"
#  endif
#  if !defined(OPENCL_KERNELS_SOURCE_MULTIPLY)
#    error "OpenCL SMM-kernel code not found!"
#  endif

/* OPENCL_LIBSMM_DESCINIT: initialize a LIBXSMM GEMM descriptor using the initializer that matches
 * the LIBXSMM version. Newer than v1.17: PREC is passed four times to libxsmm_gemm_descriptor_init.
 * Otherwise: libxsmm_gemm_descriptor_dinit receives two literal 1.0 arguments (presumably alpha and
 * beta; confirm against the LIBXSMM API).
 */
#  if LIBXSMM_VERSION4(1, 17, 0, 0) < LIBXSMM_VERSION_NUMBER
#    define OPENCL_LIBSMM_DESCINIT(BLOB, PREC, M, N, K, LDA, LDB, LDC, FLAGS, PREFETCH) \
      libxsmm_gemm_descriptor_init(BLOB, PREC, PREC, PREC, PREC, M, N, K, LDA, LDB, LDC, FLAGS, PREFETCH)
#  else
#    define OPENCL_LIBSMM_DESCINIT(BLOB, PREC, M, N, K, LDA, LDB, LDC, FLAGS, PREFETCH) \
      libxsmm_gemm_descriptor_dinit(BLOB, PREC, M, N, K, LDA, LDB, LDC, 1.0, 1.0, FLAGS, PREFETCH)
#  endif

/* Validation switches derived from OPENCL_LIBSMM_VALIDATE:
 * - VALIDATE_TRANS is only derived if the value is greater than one or negative,
 * - VALIDATE_SMM and VALIDATE_EXIT are derived whenever OPENCL_LIBSMM_VALIDATE is defined.
 * Each switch can be predefined individually, in which case it is left as is.
 */
#  if !defined(OPENCL_LIBSMM_VALIDATE_TRANS) && defined(OPENCL_LIBSMM_VALIDATE) && \
    (1 < OPENCL_LIBSMM_VALIDATE || 0 > OPENCL_LIBSMM_VALIDATE)
#    define OPENCL_LIBSMM_VALIDATE_TRANS
#  endif
#  if !defined(OPENCL_LIBSMM_VALIDATE_SMM) && defined(OPENCL_LIBSMM_VALIDATE)
#    define OPENCL_LIBSMM_VALIDATE_SMM
#  endif
#  if !defined(OPENCL_LIBSMM_VALIDATE_EXIT) && defined(OPENCL_LIBSMM_VALIDATE) && 1
#    define OPENCL_LIBSMM_VALIDATE_EXIT
#  endif
/* Name fragments of the kernel functions: used to build kernel names and (in libsmm_acc_finalize)
 * to tell transpose-kernels from SMM-kernels by searching the kernel's function name. */
#  if !defined(OPENCL_LIBSMM_KERNELNAME_TRANS)
#    define OPENCL_LIBSMM_KERNELNAME_TRANS "trans"
#  endif
#  if !defined(OPENCL_LIBSMM_KERNELNAME_SMM)
#    define OPENCL_LIBSMM_KERNELNAME_SMM "smm"
#  endif
/* Number of locks per kernel kind (usage is outside of this part of the file). */
#  if !defined(OPENCL_LIBSMM_NLOCKS_TRANS)
#    define OPENCL_LIBSMM_NLOCKS_TRANS 16
#  endif
#  if !defined(OPENCL_LIBSMM_NLOCKS_SMM)
#    define OPENCL_LIBSMM_NLOCKS_SMM 16
#  endif
/* Enables the device query in opencl_libsmm_use_cmem (constant memory); the trailing "&& 1" is a
 * manual toggle: change it to "&& 0" to disable the default. */
#  if !defined(OPENCL_LIBSMM_CMEM) && 1
#    define OPENCL_LIBSMM_CMEM
#  endif
/* default: decompose C-matrix into column-vectors (BMxBN) */
#  if !defined(OPENCL_LIBSMM_DEFAULT_BM)
#    define OPENCL_LIBSMM_DEFAULT_BM INT_MAX
#  endif
#  if !defined(OPENCL_LIBSMM_DEFAULT_BN)
#    define OPENCL_LIBSMM_DEFAULT_BN 1
#  endif
/* default BK: INT_MAX unless the "#if 1" below is manually flipped (then 1) */
#  if !defined(OPENCL_LIBSMM_DEFAULT_BK)
#    if 1
#      define OPENCL_LIBSMM_DEFAULT_BK INT_MAX
#    else
#      define OPENCL_LIBSMM_DEFAULT_BK 1
#    endif
#  endif
/* further tuning defaults (BS, BS_MIN, SMM_S, VMIN); their consumers are outside of this part of the file */
#  if !defined(OPENCL_LIBSMM_DEFAULT_BS)
#    define OPENCL_LIBSMM_DEFAULT_BS 8
#  endif
#  if !defined(OPENCL_LIBSMM_BS_MIN) && 1
#    define OPENCL_LIBSMM_BS_MIN 32
#  endif
#  if !defined(OPENCL_LIBSMM_SMM_S)
#    define OPENCL_LIBSMM_SMM_S 64
#  endif
#  if !defined(OPENCL_LIBSMM_VMIN)
#    define OPENCL_LIBSMM_VMIN 8
#  endif

/* approximate arithmetic intensity for SMMs like C += Ai * Bi (beta=1):
 * 2*M*N*K operations divided by TYPESIZE*K*(M+N) bytes; the result is a double (2.0 literal) */
#  define OPENCL_LIBSMM_AI(M, N, K, TYPESIZE) ((2.0 * (M) * (N) * (K)) / ((TYPESIZE) * (K) * ((M) + (N))))
/* determine type-size (in bytes, as int) of a given type-ID; yields 0 for any ID other than
 * dbcsr_type_real_8 (double) and dbcsr_type_real_4 (float) */
#  define OPENCL_LIBSMM_TYPESIZE(TYPEID) \
    (dbcsr_type_real_8 == (TYPEID) ? ((int)sizeof(double)) : (dbcsr_type_real_4 == (TYPEID) ? ((int)sizeof(float)) : 0 /*unknown*/))


#  if defined(__cplusplus)
extern "C" {
#  endif

/* Tracks the initialization status of LIBSMM as a counter: libsmm_acc_init atomically increments it
 * and libsmm_acc_finalize atomically decrements it. Setup runs when the counter becomes one and
 * teardown runs when it becomes zero again. */
int opencl_libsmm_initialized;


/**
 * Checks whether the device's constant buffer is at least as large as its maximum allocation size.
 * Returns EXIT_SUCCESS if so, EXIT_FAILURE if not (or if OPENCL_LIBSMM_CMEM is not defined),
 * or the error code recorded by ACC_OPENCL_CHECK if a device query failed.
 */
int opencl_libsmm_use_cmem(cl_device_id device) {
#  if defined(OPENCL_LIBSMM_CMEM)
  cl_ulong max_alloc = 1, max_cmem = 0; /* defaults compare as "constant memory too small" */
  int result = EXIT_SUCCESS;
  ACC_OPENCL_CHECK(clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_alloc, NULL),
    "retrieve maximum size of memory allocation", result);
  ACC_OPENCL_CHECK(clGetDeviceInfo(device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(cl_ulong), &max_cmem, NULL),
    "retrieve maximum size of constant buffer", result);
  if (EXIT_SUCCESS == result && max_cmem < max_alloc) {
    result = EXIT_FAILURE; /* queries succeeded but constant buffer is smaller than an allocation can be */
  }
  return result;
#  else
  LIBXSMM_UNUSED(device);
  return EXIT_FAILURE;
#  endif
}


#  if defined(OPENCL_LIBSMM_VALIDATE) && (0 != OPENCL_LIBSMM_VALIDATE)
/**
 * Prints an m-by-n matrix to ostream, one matrix row per line with two decimals per element.
 * Elements are read as mat[i * n + j], i.e., consecutive rows are n elements apart.
 * The first line starts with the label (NULL is treated as empty label), and every further
 * line is indented by the label's length such that all rows are aligned.
 * Elements of a type other than dbcsr_type_real_8/dbcsr_type_real_4 are printed as "?".
 */
void opencl_libsmm_print_matrix(FILE* ostream, const char* label, libsmm_acc_data_t type, const void* mat, int m, int n) {
  int i, j;
  const char* const s = (NULL != label ? label : "");
  const int len = (int)strlen(s);
  for (i = 0; i < m; ++i) {
    if (0 < i) {
      /* pad an empty string to the label's width: padding " " would emit a stray blank for an empty label */
      fprintf(ostream, "%*s", len, "");
    }
    else {
      fprintf(ostream, "%s", s);
    }
    for (j = 0; j < n; ++j) {
      switch (type) {
        case dbcsr_type_real_8: fprintf(ostream, "%.2f ", ((const double*)mat)[i * n + j]); break;
        case dbcsr_type_real_4: fprintf(ostream, "%.2f ", ((const float*)mat)[i * n + j]); break;
        default: fprintf(ostream, "? ");
      }
    }
    fprintf(ostream, "\n");
  }
}
#  endif


/**
 * Writes the key of a transpose-kernel to the given stream, enclosed by begin/close characters.
 * - config != NULL: the key's values (type, m, n) are written; config == NULL: the column names ("t", "m", "n").
 * - delim/begin/close: only the first character is used; NULL selects the first character of
 *   ACC_OPENCL_DELIMS, '{', and '}' respectively.
 * - Nothing but the enclosing characters is written if key is NULL, and nothing at all if key is NULL
 *   and only_key is non-zero.
 * Returns the accumulated results of fprintf (number of characters), or -1 if stream is NULL.
 * A non-positive result triggers the assertion (debug builds).
 */
int opencl_libsmm_write_trans_params(FILE* stream, int only_key, const opencl_libsmm_transkey_t* key,
  const opencl_libsmm_trans_t* config, const char* delim, const char* begin, const char* close) {
  int nchars = -1;
  if (NULL != stream) {
    const char sep = (NULL != delim ? *delim : *ACC_OPENCL_DELIMS);
    const int enclose = (NULL != key || 0 == only_key);
    nchars = 0;
    if (enclose) nchars += fprintf(stream, "%c", NULL != begin ? *begin : '{');
    if (NULL != key) {
      if (NULL != config) { /* values */
        nchars += fprintf(stream, "%i%c%i%c%i", (int)key->type, sep, key->m, sep, key->n);
      }
      else { /* column names */
        nchars += fprintf(stream, "t%cm%cn", sep, sep);
      }
    }
    if (enclose) nchars += fprintf(stream, "%c", NULL != close ? *close : '}');
  }
  assert(0 < nchars);
  return nchars;
}


/**
 * Writes key and/or parameters of an SMM-kernel to the given stream, enclosed by begin/close characters.
 * - config != NULL: values are written; config == NULL: the corresponding column names are written.
 * - key != NULL: the key (type, m, n, k) comes first; only_key != 0 suppresses the parameter part.
 * - The flags value is only appended if it is non-zero (there is no column name for it).
 * - delim/begin/close: only the first character is used; NULL selects the first character of
 *   ACC_OPENCL_DELIMS, '{', and '}' respectively.
 * Returns the accumulated results of fprintf (number of characters), or -1 if stream is NULL.
 * A non-positive result triggers the assertion (debug builds).
 */
int opencl_libsmm_write_smm_params(FILE* stream, int only_key, const opencl_libsmm_smmkey_t* key, const opencl_libsmm_smm_t* config,
  const char* delim, const char* begin, const char* close) {
  int nchars = -1;
  if (NULL != stream) {
    const char sep = (NULL != delim ? *delim : *ACC_OPENCL_DELIMS);
    const int enclose = (NULL != key || 0 == only_key);
    const int with_params = (0 == only_key);
    nchars = 0;
    if (enclose) nchars += fprintf(stream, "%c", NULL != begin ? *begin : '{');
    if (NULL != key) { /* key part */
      if (NULL != config) {
        nchars += fprintf(stream, "%i%c%i%c%i%c%i", (int)key->type, sep, key->m, sep, key->n, sep, key->k);
      }
      else {
        nchars += fprintf(stream, "t%cm%cn%ck", sep, sep, sep);
      }
      if (with_params) nchars += fprintf(stream, "%c ", sep); /* separate key from parameters */
    }
    if (with_params) { /* parameter part */
      if (NULL != config) {
        nchars += fprintf(stream, "%i%c%i%c%i%c%i%c %i%c%i%c %i%c%i%c%i%c %i%c%i%c %i%c%i%c%i%c%i", config->bs, sep, config->bm,
          sep, config->bn, sep, config->bk, sep, config->ws, sep, config->wg, sep, config->lu, sep, config->nz, sep, config->al,
          sep, config->tb, sep, config->tc, sep, config->ap, sep, config->aa, sep, config->ab, sep, config->ac);
        if (0 != config->flags) nchars += fprintf(stream, "%c %i", sep, config->flags);
      }
      else {
        nchars += fprintf(stream, "bs%cbm%cbn%cbk%c ws%cwg%c lu%cnz%cal%c tb%ctc%c ap%caa%cab%cac", sep, sep, sep, sep, sep, sep,
          sep, sep, sep, sep, sep, sep, sep, sep);
      }
    }
    if (enclose) nchars += fprintf(stream, "%c", NULL != close ? *close : '}');
  }
  assert(0 < nchars);
  return nchars;
}


/**
 * Parses one line of delimited SMM-parameters (delimiters: ACC_OPENCL_DELIMS) into key and value.
 * Field order: [device], type, m, n, k, [S, GFLOPS], bs, bm, bn, bk, ws, wg, lu, nz, al, tb, tc,
 * ap, aa, ab, ac, [flags]. The device field is only expected if device is not NULL, S/GFLOPS are
 * only expected if perfest is not NULL, and flags is optional at the end of the line.
 *
 * - parambuf: NUL-terminated input; modified in-place (tokenized by strtok).
 * - key, value: outputs (must not be NULL); both are zeroed before parsing.
 * - perfest: optional; on success, the ratio of GFLOPS and arithmetic intensity is accumulated
 *   (sum of logarithms, maximum, and count) separately for double- and single-precision.
 * - device: optional buffer receiving the device field (caller provides sufficient capacity).
 * - key_ok: optional; set to 1 if at least four fields were consumed (never reset to 0 here).
 *
 * Returns EXIT_SUCCESS if all expected fields were consumed and the type is dbcsr_type_real_8 or
 * dbcsr_type_real_4, otherwise EXIT_FAILURE.
 */
int opencl_libsmm_read_smm_params(char* parambuf, opencl_libsmm_smmkey_t* key, opencl_libsmm_smm_t* value,
  opencl_libsmm_perfest_t* perfest, char* device, int* key_ok) {
  const char* const end = parambuf + strlen(parambuf); /* before strtok */
  char* s = strtok(parambuf, ACC_OPENCL_DELIMS);
  const int opt_consumed = (NULL != perfest ? 2 : 0) + (NULL != device ? 1 : 0);
  int result = EXIT_SUCCESS, i = 0, ivalue, consumed = 0, c = 0, max_consumed = opt_consumed + 19;
  double gflops = 0; /* always parsed (case 6) when used below; initialized for definiteness */
  assert(NULL != key && NULL != value);
  LIBXSMM_MEMZERO127(key); /* potentially heterogeneous key-data (alignment gaps) */
  memset(value, 0, sizeof(opencl_libsmm_smm_t));
  /* i enumerates the fields; a token which is not consumed by field i is retried with field i+1 */
  for (; NULL != s; ++i, c = consumed) {
    switch (i) {
      case 0:
        if (NULL != device && 1 == sscanf(s, "%[^" ACC_OPENCL_DELIMS "]", device)) {
          ++consumed; /* optional device name */
        }
        break;
      case 1:
        if (1 == sscanf(s, "%i", &ivalue)) {
          key->type = (libsmm_acc_data_t)ivalue;
          ++consumed;
        }
        break;
      case 2:
        if (1 == sscanf(s, "%i", &ivalue) && 0 < ivalue) {
          key->m = ivalue;
          ++consumed;
        }
        break;
      case 3:
        if (1 == sscanf(s, "%i", &ivalue) && 0 < ivalue) {
          key->n = ivalue;
          ++consumed;
        }
        break;
      case 4:
        if (1 == sscanf(s, "%i", &ivalue) && 0 < ivalue) {
          key->k = ivalue;
          ++consumed;
        }
        break;
      case 5:
        if (NULL != perfest && 1 == sscanf(s, "%i", &ivalue)) {
          value->s = ivalue;
          ++consumed; /* optional "S" param */
        }
        break;
      case 6:
        if (NULL != perfest && 1 == sscanf(s, "%lf", &gflops) && 0 <= gflops) {
          value->gflops = gflops;
          ++consumed; /* optional "GFLOPS" param */
        }
        break;
      case 7:
        if (1 == sscanf(s, "%i", &ivalue)) {
          value->bs = ivalue;
          ++consumed;
        }
        break;
      case 8:
        if (1 == sscanf(s, "%i", &ivalue)) {
          value->bm = ivalue;
          ++consumed;
        }
        break;
      case 9:
        if (1 == sscanf(s, "%i", &ivalue)) {
          value->bn = ivalue;
          ++consumed;
        }
        break;
      case 10:
        if (1 == sscanf(s, "%i", &ivalue)) {
          value->bk = ivalue;
          ++consumed;
        }
        break;
      case 11:
        if (1 == sscanf(s, "%i", &ivalue)) {
          value->ws = ivalue;
          ++consumed;
        }
        break;
      case 12:
        if (1 == sscanf(s, "%i", &ivalue)) {
          value->wg = ivalue;
          ++consumed;
        }
        break;
      case 13:
        if (1 == sscanf(s, "%i", &ivalue)) {
          value->lu = ivalue;
          ++consumed;
        }
        break;
      case 14:
        if (1 == sscanf(s, "%i", &ivalue)) {
          value->nz = ivalue;
          ++consumed;
        }
        break;
      case 15:
        if (1 == sscanf(s, "%i", &ivalue)) {
          value->al = ivalue;
          ++consumed;
        }
        break;
      case 16:
        if (1 == sscanf(s, "%i", &ivalue)) {
          value->tb = ivalue;
          ++consumed;
        }
        break;
      case 17:
        if (1 == sscanf(s, "%i", &ivalue)) {
          value->tc = ivalue;
          ++consumed;
        }
        break;
      case 18:
        if (1 == sscanf(s, "%i", &ivalue)) {
          value->ap = ivalue;
          ++consumed;
        }
        break;
      case 19:
        if (1 == sscanf(s, "%i", &ivalue)) {
          value->aa = ivalue;
          ++consumed;
        }
        break;
      case 20:
        if (1 == sscanf(s, "%i", &ivalue)) {
          value->ab = ivalue;
          ++consumed;
        }
        break;
      case 21:
        if (1 == sscanf(s, "%i", &ivalue)) {
          value->ac = ivalue;
          ++consumed;
        }
        break;
      case 22:
        if (1 == sscanf(s, "%i", &ivalue)) {
          value->flags = ivalue;
          ++max_consumed; /* optional field: expected only if present */
          ++consumed;
        }
        break;
      default: s = NULL; /* no further fields: terminate the loop */
    }
    if (NULL != s && c != consumed) { /* token was consumed: advance to the next token */
      char* const stop = s + strlen(s); /* terminator of the current token */
      /* Only continue if the terminator is a former delimiter inside of the input. If it is the
       * input's own terminator (last token), there is nothing left and stop + 1 must not be read.
       */
      s = (stop < end ? strtok(stop + 1, ACC_OPENCL_DELIMS) : NULL);
    }
  }
  if (max_consumed == consumed) {
    switch (key->type) {
      case dbcsr_type_real_8:
        if (NULL != perfest) {
          const double ratio = gflops / OPENCL_LIBSMM_AI(key->m, key->n, key->k, sizeof(double));
#  if LIBXSMM_VERSION4(1, 17, 0, 0) < LIBXSMM_VERSION_NUMBER
          libxsmm_kahan_sum(log(ratio), &perfest->gf_ai_dratio_sumlog, &perfest->gf_ai_dratio_kahan);
#  else
          perfest->gf_ai_dratio_sumlog += log(ratio);
#  endif
          if (perfest->gf_ai_dratio_max < ratio) perfest->gf_ai_dratio_max = ratio;
          ++perfest->dcount;
        }
        break;
      case dbcsr_type_real_4:
        if (NULL != perfest) {
          const double ratio = gflops / OPENCL_LIBSMM_AI(key->m, key->n, key->k, sizeof(float));
#  if LIBXSMM_VERSION4(1, 17, 0, 0) < LIBXSMM_VERSION_NUMBER
          libxsmm_kahan_sum(log(ratio), &perfest->gf_ai_sratio_sumlog, &perfest->gf_ai_sratio_kahan);
#  else
          perfest->gf_ai_sratio_sumlog += log(ratio);
#  endif
          if (perfest->gf_ai_sratio_max < ratio) perfest->gf_ai_sratio_max = ratio;
          ++perfest->scount;
        }
        break;
      default: result = EXIT_FAILURE; /* unsupported type */
    }
  }
  else result = EXIT_FAILURE; /* missing or malformed field(s) */
  if (NULL != key_ok && 4 <= consumed) *key_ok = 1;
  return result;
}


/**
 * Initializes LIBSMM (reference-counted by opencl_libsmm_initialized); only the call which raises
 * the counter to one performs the actual work:
 * 1. c_dbcsr_acc_init (unless built with __DBCSR_ACC) and libxsmm_init.
 * 2. Tuned parameters as per environment variable OPENCL_LIBSMM_SMM_PARAMS: the value is first
 *    tried as name of a CSV-file, and otherwise as a string of kernel parameters. A value starting
 *    with '0' disables loading any parameters (including embedded parameters).
 * 3. Embedded parameters (OPENCL_KERNELS_PARAMS_SMM) if nothing was loaded from file, or if a
 *    parameter string was given (embedded entries matching the given type, m, and n are skipped).
 * Parameters are registered in LIBXSMM's registry using the SMM-key.
 * With OpenMP, calling from inside of a parallel region on a thread other than thread 0 is an error.
 */
int libsmm_acc_init(void) {
#  if defined(_OPENMP)
  /* initialization/finalization is not meant to be thread-safe */
  int result = ((0 == omp_in_parallel() || /*main*/ 0 == omp_get_thread_num()) ? EXIT_SUCCESS : EXIT_FAILURE);
#  else
  int result = EXIT_SUCCESS;
#  endif
  /* multiple calls to libsmm_acc_init are not considered as an error */
  if (1 == LIBXSMM_ATOMIC_ADD_FETCH(&opencl_libsmm_initialized, 1, ACC_OPENCL_ATOMIC)) {
#  if !defined(__DBCSR_ACC)
    /* DBCSR shall call c_dbcsr_acc_init as well as libsmm_acc_init (since both interfaces are used).
     * Also, libsmm_acc_init may privately call c_dbcsr_acc_init (as it depends on the ACC interface).
     * The implementation of c_dbcsr_acc_init should hence be safe against "over initialization".
     * However, DBCSR only calls c_dbcsr_acc_init (and expects an implicit libsmm_acc_init).
     */
    if (EXIT_SUCCESS == result) result = c_dbcsr_acc_init();
#  endif
    libxsmm_init();
    if (EXIT_SUCCESS == result) {
      opencl_libsmm_perfest_t perfest;
      char* const env_params = getenv("OPENCL_LIBSMM_SMM_PARAMS");
      memset(&perfest, 0, sizeof(perfest));
      if (NULL == env_params || '0' != *env_params) {
        char buffer[ACC_OPENCL_BUFFERSIZE], bufname[ACC_OPENCL_BUFFERSIZE];
#  if defined(OPENCL_KERNELS_DEVICES)
        const int ndevices_params = (int)(sizeof(OPENCL_KERNELS_DEVICES) / sizeof(*OPENCL_KERNELS_DEVICES));
#  endif
        opencl_libsmm_smm_t config;
        opencl_libsmm_smmkey_t key, key_direct;
        int key_direct_skip = 0, ntuned = 0;
        if (NULL != env_params && '\0' != *env_params) { /* filename */
          FILE* const file = fopen(env_params, "r");
          if (NULL != file) {
            /* consume first line, check for device entry, and skip CSV header */
            if (NULL != fgets(buffer, ACC_OPENCL_BUFFERSIZE, file)) {
              /* the header determines which optional columns are expected in every row */
              char* const device = (NULL != LIBXSMM_STRISTR(buffer, "device") ? bufname : NULL);
              opencl_libsmm_perfest_t* const gflops = (NULL != LIBXSMM_STRISTR(buffer, "gflops") ? &perfest : NULL);
              while (NULL != fgets(buffer, ACC_OPENCL_BUFFERSIZE, file)) { /* read params from CSV-file */
                if (EXIT_SUCCESS == opencl_libsmm_read_smm_params(buffer, &key, &config, gflops, device, NULL /*key_ok*/)) {
                  opencl_libsmm_smm_t* config_init;
                  c_dbcsr_acc_opencl_config.devmatch = 0; /* disable device-match */
                  key.devuid = 0;
                  config_init = (opencl_libsmm_smm_t*)libxsmm_xdispatch(&key, sizeof(key));
                  if (NULL == config_init) {
                    if (NULL == libxsmm_xregister(&key, sizeof(key), sizeof(config), &config)) {
                      result = EXIT_FAILURE;
                      break;
                    }
                    else ++ntuned;
                  }
                  else if (config_init->gflops < config.gflops) { /* update */
                    memcpy(config_init, &config, sizeof(config));
                  }
                }
                else {
                  if (0 != c_dbcsr_acc_opencl_config.verbosity) {
                    fprintf(stderr, "WARN LIBSMM: failed to load tuned parameters from CSV-file!\n");
                  }
                  break; /* invalid entry */
                }
              }
            }
            else { /* invalid header */
              result = EXIT_FAILURE;
            }
            fclose(file);
          }
          else if (EXIT_SUCCESS == opencl_libsmm_read_smm_params(
                                     env_params, &key_direct, &config, NULL /*perfest*/, NULL /*device*/, &key_direct_skip))
          { /* try OPENCL_LIBSMM_SMM_PARAMS as string of kernel parameters (not device-specific) */
            assert(0 == key_direct.devuid && 0 != key_direct_skip);
            if (NULL != libxsmm_xregister(&key_direct, sizeof(key_direct), sizeof(config), &config)) {
              c_dbcsr_acc_opencl_config.devmatch = 0; /* disable device-match */
              ntuned = 1;
            }
            else result = EXIT_FAILURE;
          }
          else if (0 == key_direct_skip && 0 != c_dbcsr_acc_opencl_config.verbosity) { /* soft-error */
            fprintf(stderr, "WARN LIBSMM: failed to open parameter file!\n");
          }
        }
#  if defined(OPENCL_KERNELS_PARAMS_SMM) && defined(OPENCL_KERNELS_DEVICES)
        if (EXIT_SUCCESS == result && (0 == ntuned || 0 != key_direct_skip)) {
          const char *line = OPENCL_KERNELS_PARAMS_SMM, *next;
#    if LIBXSMM_VERSION4(1, 17, 0, 0) < LIBXSMM_VERSION_NUMBER
          int active_match = -1;
          if (EXIT_SUCCESS == c_dbcsr_acc_opencl_device_name(c_dbcsr_acc_opencl_config.device.id, bufname, ACC_OPENCL_BUFFERSIZE,
                                NULL /*platform*/, 0 /*platform_maxlen*/, /*cleanup*/ 1))
          { /* determine best-matching parameters based on name of device */
            int i = 0, best = 0;
            for (; i < ndevices_params; ++i) {
              const int score = libxsmm_strimatch(bufname, OPENCL_KERNELS_DEVICES[i], NULL);
              unsigned int uid;
              if (best < score ||
                  ((best == score) &&
                    EXIT_SUCCESS == c_dbcsr_acc_opencl_device_uid(NULL /*device*/, OPENCL_KERNELS_DEVICES[i], &uid) &&
                    uid == c_dbcsr_acc_opencl_config.device.uid))
              {
                active_match = i;
                best = score;
              }
            }
          }
#    endif
          do {
            next = strchr(line, '\n');
            /* compare lengths rather than forming a pointer which may be beyond the embedded string */
            if (NULL != next && (next - line) < ACC_OPENCL_BUFFERSIZE) {
              const int len = next - line;
              memcpy(buffer, line, len);
              buffer[len] = '\0';
              if (EXIT_SUCCESS == opencl_libsmm_read_smm_params(/* read params from embedded params */
                                    buffer, &key, &config, &perfest, bufname /*consume name/id*/, NULL /*key_ok*/))
              {
                /* skip embedded entries superseded by directly given parameters (key-data preceding k is compared) */
                if (0 == key_direct_skip || 0 != memcmp(&key_direct, &key, (const char*)&key.k - (const char*)&key)) {
                  opencl_libsmm_smm_t* config_init;
                  const int i = atoi(bufname); /* device field of embedded entries is an index into OPENCL_KERNELS_DEVICES */
                  if (0 >= ndevices_params || 0 == c_dbcsr_acc_opencl_config.devmatch || 0 > i || ndevices_params <= i ||
                      EXIT_SUCCESS != c_dbcsr_acc_opencl_device_uid(NULL /*device*/, OPENCL_KERNELS_DEVICES[i], &key.devuid))
                  {
                    key.devuid = 0;
                  }
                  config_init = (opencl_libsmm_smm_t*)libxsmm_xdispatch(&key, sizeof(key));
                  if (NULL == config_init) {
                    if (NULL == libxsmm_xregister(&key, sizeof(key), sizeof(config), &config)) {
                      result = EXIT_FAILURE;
                      break;
                    }
                    else ++ntuned;
                  }
                  else if (config_init->gflops < config.gflops) { /* update */
                    memcpy(config_init, &config, sizeof(config));
                  }
#    if LIBXSMM_VERSION4(1, 17, 0, 0) < LIBXSMM_VERSION_NUMBER
                  /* additionally register parameters of the best-matching device under the UID of the active device */
                  if (active_match == i && c_dbcsr_acc_opencl_config.device.uid != key.devuid) {
                    key.devuid = c_dbcsr_acc_opencl_config.device.uid;
                    config_init = (opencl_libsmm_smm_t*)libxsmm_xdispatch(&key, sizeof(key));
                    if (NULL == config_init && NULL != libxsmm_xregister(&key, sizeof(key), sizeof(config), &config)) {
                      static int info = 0;
                      if (0 == info && 0 != c_dbcsr_acc_opencl_config.verbosity &&
                          EXIT_SUCCESS == c_dbcsr_acc_opencl_device_name(c_dbcsr_acc_opencl_config.device.id, bufname,
                                            ACC_OPENCL_BUFFERSIZE, NULL /*platform*/, 0 /*platform_maxlen*/, /*cleanup*/ 0))
                      {
                        fprintf(stderr, "INFO ACC/LIBSMM: PARAMS of \"%s\" used for \"%s\"\n", OPENCL_KERNELS_DEVICES[i], bufname);
                        info = 1;
                      }
                    }
                  }
#    endif
                }
              }
              else {
                if (0 != c_dbcsr_acc_opencl_config.verbosity) {
                  fprintf(stderr, "WARN LIBSMM: failed to load embedded parameters!\n");
                }
                break;
              }
              line = ++next;
            }
            else if (NULL != next) { /* line exceeds buffer: stop (otherwise this loop would never advance) */
              if (0 != c_dbcsr_acc_opencl_config.verbosity) {
                fprintf(stderr, "WARN LIBSMM: failed to load embedded parameters!\n");
              }
              break;
            }
          } while (NULL != next);
        }
#  endif
#  if defined(OPENCL_KERNELS_DEVICES)
        if (EXIT_SUCCESS == result && 0 != ntuned &&
            (2 <= c_dbcsr_acc_opencl_config.verbosity || 0 > c_dbcsr_acc_opencl_config.verbosity))
        {
          fprintf(stderr, "INFO ACC/LIBSMM: PARAMS in %i set%s loaded targeting ", ntuned, 1 != ntuned ? "s" : "");
          if (0 != c_dbcsr_acc_opencl_config.devmatch) {
            fprintf(stderr, "%i device%s\n", ndevices_params, 1 != ndevices_params ? "s" : "");
            if (3 <= c_dbcsr_acc_opencl_config.verbosity || 0 > c_dbcsr_acc_opencl_config.verbosity) {
              unsigned int i = 0;
              for (; i < (unsigned int)ndevices_params; ++i) {
                fprintf(stderr, "INFO ACC/LIBSMM: PARAMS -> \"%s\"\n", OPENCL_KERNELS_DEVICES[i]);
              }
            }
          }
          else fprintf(stderr, "any device\n");
        }
#  endif
      }
#  if defined(OPENCL_LIBSMM_VALIDATE)
      c_dbcsr_acc_opencl_config.xhints &= ~1; /* disable USM */
#  endif
    }
  }
  ACC_OPENCL_RETURN(result);
}


/**
 * Finalizes LIBSMM (counterpart of libsmm_acc_init); only the call which lowers the counter
 * opencl_libsmm_initialized to zero performs the actual work:
 * 1. Release the OpenCL kernels held by user-entries of LIBXSMM's registry (newer LIBXSMM only).
 * 2. c_dbcsr_acc_finalize (unless built with __DBCSR_ACC), followed by libxsmm_finalize.
 * With OpenMP, calling from inside of a parallel region on a thread other than thread 0 yields
 * EXIT_FAILURE as result (the counter is decremented regardless).
 */
int libsmm_acc_finalize(void) {
  /* Routine libsmm_acc_init is called in master thread inside of parallel region
   * However, libsmm_acc_finalize is indirectly called (c_dbcsr_acc_finalize)
   * inside of a parallel region (not just the master thread).
   */
  int result = EXIT_SUCCESS;
#  if defined(_OPENMP)
  /* initialization/finalization is not meant to be thread-safe */
  if (0 != omp_in_parallel() && /*not main*/ 0 != omp_get_thread_num()) result = EXIT_FAILURE;
#  endif
  /* multiple calls to libsmm_acc_finalize are not considered as an error */
  if (0 == LIBXSMM_ATOMIC_SUB_FETCH(&opencl_libsmm_initialized, 1, ACC_OPENCL_ATOMIC)) {
#  if LIBXSMM_VERSION4(1, 17, 0, 0) < LIBXSMM_VERSION_NUMBER
    char fname[ACC_OPENCL_MAXSTRLEN];
    const void* entry = libxsmm_get_registry_begin(LIBXSMM_KERNEL_KIND_USER, NULL /*key*/);
    while (NULL != entry) {
      /* opencl_libsmm_trans_t/opencl_libsmm_smm_t carry cl_kernel as 1st data member */
      const opencl_libsmm_smm_t* const smm = (const opencl_libsmm_smm_t*)entry;
      cl_kernel kernel = *(const cl_kernel*)entry;
      if (NULL == kernel) kernel = smm->kernel[1]; /* fall back to 2nd kernel */
      if (NULL != kernel) {
        /* the kernel's function name tells the kind of the entry (entry is ignored if the query fails) */
        int status = clGetKernelInfo(kernel, CL_KERNEL_FUNCTION_NAME, sizeof(fname), fname, NULL);
        if (EXIT_SUCCESS == status) {
          if (NULL != strstr(fname, OPENCL_LIBSMM_KERNELNAME_TRANS)) { /* trans-kernel */
            status = clReleaseKernel(kernel);
          }
          else if (NULL != strstr(fname, OPENCL_LIBSMM_KERNELNAME_SMM)) { /* SMM-kernel */
            status = clReleaseKernel(kernel);
            if (EXIT_SUCCESS == status && kernel != smm->kernel[1]) { /* 2nd kernel not yet released */
              kernel = smm->kernel[1];
              if (NULL != kernel) status = clReleaseKernel(kernel);
            }
          }
          if (EXIT_SUCCESS != status) result = status;
        }
      }
      entry = libxsmm_get_registry_next(entry, NULL /*key*/);
    }
#  endif
#  if !defined(__DBCSR_ACC)
    /* Mirrors libsmm_acc_init, which privately calls c_dbcsr_acc_init in this configuration:
     * the ACC interface is finalized here as well (skipped if an earlier step failed).
     */
    if (EXIT_SUCCESS == result) result = c_dbcsr_acc_finalize();
#  endif
    libxsmm_finalize();
  }
  /* with __DBCSR_ACC, c_dbcsr_acc_finalize is not called since it can be used independently */
  return result;
}


/** Reports thread-safety: 1 if built with OpenMP (matching DBCSR's threading level), otherwise 0. */
c_dbcsr_acc_bool_t libsmm_acc_is_thread_safe(void) {
#  if defined(_OPENMP)
  const c_dbcsr_acc_bool_t thread_safe = 1;
#  else
  const c_dbcsr_acc_bool_t thread_safe = 0;
#  endif
  return thread_safe;
}


/* Transposes a stack (batch) of m-by-n matrices located in device memory.
 *
 * dev_trs_stack:  device pointer to the batch-list; each entry is an element-offset into dev_data
 *                 (the validation code addresses a matrix at "entry * typesize" bytes).
 * offset:         passed to the kernel as its first argument (first stack entry to consider).
 * stack_size:     number of matrices to process (one work-group per stack entry is launched).
 * dev_data:       device pointer to the matrix data; the transposed result is expected at the
 *                 same location as the respective input matrix (see validation code).
 * datatype:       dbcsr_type_real_8 or dbcsr_type_real_4 (subject to OPENCL_LIBSMM_F64/F32).
 * m, n:           shape of every matrix in the stack.
 * max_kernel_dim: upper bound for m and n; larger shapes are not processed.
 * stream:         stream used to enqueue the kernel (and validation transfers).
 *
 * The kernel is generated on first use per (datatype, m, n) and stored in LIBXSMM's registry.
 * Nothing is launched (and the status is not turned into an error) if the datatype is not enabled,
 * the stack is empty, the matrix has at most one element, or the shape exceeds max_kernel_dim.
 * Returns EXIT_SUCCESS, or an error code if querying device pointers, building/registering the kernel,
 * launching the kernel, or the optional validation failed.
 */
int libsmm_acc_transpose(const int* dev_trs_stack, int offset, int stack_size, void* dev_data, libsmm_acc_data_t datatype, int m,
  int n, int max_kernel_dim, void* stream) {
  c_dbcsr_acc_opencl_info_memptr_t info_stack, info_mdata;
  int result = EXIT_SUCCESS;
  const int mn = m * n;
  assert((NULL != dev_trs_stack && NULL != stream && NULL != dev_data && 0 <= offset && 0 <= stack_size) || 0 == stack_size);
  result |= c_dbcsr_acc_opencl_info_devptr(&info_stack, dev_trs_stack, sizeof(int), NULL /*amount*/, NULL /*offset*/);
  result |= c_dbcsr_acc_opencl_info_devptr(&info_mdata, dev_data, 1 /*elsize*/, NULL /*amount*/, NULL /*offset*/);
  if (EXIT_SUCCESS == result &&
      (
#  if defined(OPENCL_LIBSMM_F64)
        dbcsr_type_real_8 == datatype
#  else
        0
#  endif
        ||
#  if defined(OPENCL_LIBSMM_F32)
        dbcsr_type_real_4 == datatype
#  else
        0
#  endif
        ) &&
      0 < stack_size && 1 < mn && m <= max_kernel_dim && n <= max_kernel_dim)
  {
    const c_dbcsr_acc_opencl_stream_t* const str = ACC_OPENCL_STREAM(stream);
    opencl_libsmm_trans_t* config;
    opencl_libsmm_transkey_t key;
#  if !defined(OPENCL_LIBSMM_VALIDATE_TRANS)
    double duration;
    const libxsmm_timer_tickint start = libxsmm_timer_tick();
#  endif
    LIBXSMM_MEMZERO127(&key); /* potentially heterogeneous key-data (alignment gaps) */
    key.type = datatype;
    key.m = m;
    key.n = n; /* initialize key */
    config = (opencl_libsmm_trans_t*)libxsmm_xdispatch(&key, sizeof(key));
    if (NULL == config) { /* kernel not yet registered: generate, build, and register it */
      char buffer[ACC_OPENCL_BUFFERSIZE], build_params[ACC_OPENCL_BUFFERSIZE];
      char fname[ACC_OPENCL_MAXSTRLEN];
      int nchar = LIBXSMM_SNPRINTF(fname, sizeof(fname),
        /* kernel names are meant to be unambiguous (BLAS-typeprefix and kernelsize) */
        "x" OPENCL_LIBSMM_KERNELNAME_TRANS "%ix%i", m, n);
#  if defined(__DBCSR_ACC)
      int routine_handle;
      c_dbcsr_timeset(LIBSMM_ACC_TRANSPOSE_ROUTINE_NAME_STRPTR, LIBSMM_ACC_TRANSPOSE_ROUTINE_NAME_LENPTR, &routine_handle);
#  endif
      if (0 < nchar && (int)sizeof(fname) > nchar) {
        const char *const env_cl = getenv("OPENCL_LIBSMM_TRANS_BUILDOPTS"), *const env_bm = getenv("OPENCL_LIBSMM_TRANS_BM");
        const char* const cmem = (EXIT_SUCCESS != opencl_libsmm_use_cmem(c_dbcsr_acc_opencl_config.device.id) ? "global"
                                                                                                              : "constant");
        const char* const param_format = "-DGLOBAL=%s -DINPLACE=%i -DFN=%s -DSM=%i -DSN=%i -DSWG=%i -DT=%s";
        const char *const env_inplace = getenv("OPENCL_LIBSMM_TRANS_INPLACE"), *tname = "";
#  if defined(OPENCL_LIBSMM_TRANS_INPLACE)
        const int inplace = ((m == n) && (NULL == env_inplace ? 1 : ('0' != *env_inplace)));
#  else
        const int inplace = ((m == n) && (NULL == env_inplace ? 0 : ('0' != *env_inplace)));
#  endif
        const int blockm = ((NULL == env_bm || '\0' == *env_bm) ? 0 : atoi(env_bm));
        const int bm = (0 >= blockm ? m : LIBXSMM_MIN(blockm, m));
        opencl_libsmm_trans_t new_config;
        memset(&new_config, 0, sizeof(new_config));
        switch (datatype) { /* element type of the kernel and type-prefix of the kernel name */
          case dbcsr_type_real_8: {
            tname = "char8"; /* double */
            fname[0] = 'd';
          } break;
          case dbcsr_type_real_4: {
            tname = "float";
            fname[0] = 's';
          } break;
          default: assert('\0' == *tname);
        }
        /* preferred WG-size: block-size if it evenly divides m (else m), limited by the device's maximum */
        new_config.wgsize = LIBXSMM_MIN((size_t)((m == bm || 0 == (m % bm)) ? bm : m), c_dbcsr_acc_opencl_config.device.wgsize[0]);
        nchar = LIBXSMM_SNPRINTF(buffer, sizeof(buffer), "%s", NULL == env_cl ? "" : env_cl);
        if (0 <= /*<*/ nchar && (int)sizeof(buffer) > nchar) {
          nchar = LIBXSMM_SNPRINTF(
            build_params, sizeof(build_params), param_format, cmem, inplace, fname, m, n, (int)new_config.wgsize, tname);
        }
        if ('\0' != *tname && 0 < nchar && (int)sizeof(build_params) > nchar) {
          result = c_dbcsr_acc_opencl_kernel(0 /*source_is_file*/, OPENCL_KERNELS_SOURCE_TRANSPOSE, fname, build_params, buffer,
            NULL /*try*/, NULL /*try_ok*/, NULL /*extnames*/, 0 /*num_exts*/, &new_config.kernel);
          if (EXIT_SUCCESS == result) {
            size_t wgsize_max;
            assert(NULL != new_config.kernel);
            result = clGetKernelWorkGroupInfo(
              new_config.kernel, c_dbcsr_acc_opencl_config.device.id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wgsize_max, NULL);
            if (EXIT_SUCCESS == result) {
              assert(0 < wgsize_max);
              if (wgsize_max < new_config.wgsize) { /* rebuild the kernel for the supported WG-size */
                /* the kernel built for the too large WG-size is superseded: release it rather than
                 * overwriting the handle (assumes c_dbcsr_acc_opencl_kernel only writes the handle).
                 */
                clReleaseKernel(new_config.kernel);
                new_config.kernel = NULL;
                new_config.wgsize = wgsize_max;
                nchar = LIBXSMM_SNPRINTF(
                  build_params, sizeof(build_params), param_format, cmem, inplace, fname, m, n, (int)new_config.wgsize, tname);
                if (0 < nchar && (int)sizeof(build_params) > nchar) {
                  result = c_dbcsr_acc_opencl_kernel(0 /*source_is_file*/, OPENCL_KERNELS_SOURCE_TRANSPOSE, fname, build_params,
                    buffer, NULL /*try*/, NULL /*try_ok*/, NULL /*extnames*/, 0 /*num_exts*/, &new_config.kernel);
                }
                else result = EXIT_FAILURE;
              }
              if (EXIT_SUCCESS == result) {
                config = (opencl_libsmm_trans_t*)libxsmm_xregister(&key, sizeof(key), sizeof(new_config), &new_config);
                if (NULL == config) { /* registration failed: must not launch with a NULL-configuration */
                  result = EXIT_FAILURE;
                }
#  if !defined(OPENCL_LIBSMM_VALIDATE_TRANS)
                if (NULL != config && (2 <= c_dbcsr_acc_opencl_config.verbosity || 0 > c_dbcsr_acc_opencl_config.verbosity)) {
                  LIBXSMM_STDIO_ACQUIRE();
                  duration = libxsmm_timer_duration(start, libxsmm_timer_tick());
                  fprintf(stderr, "INFO ACC/LIBSMM: TRANS-kernel ");
                  opencl_libsmm_write_trans_params(
                    stderr, 0 /*only_key*/, &key, NULL /*config*/, NULL /*delim*/, NULL /*begin*/, NULL /*close*/);
                  fprintf(stderr, "=");
                  opencl_libsmm_write_trans_params(
                    stderr, 0 /*only_key*/, &key, config, NULL /*delim*/, NULL /*begin*/, NULL /*close*/);
                  fprintf(stderr, " gen=%.1f ms\n", 1E3 * duration);
                  LIBXSMM_STDIO_RELEASE();
                }
#  endif
              }
            }
            /* a kernel which did not make it into the registry is not released by anybody else */
            if (EXIT_SUCCESS != result && NULL != new_config.kernel) {
              clReleaseKernel(new_config.kernel);
              new_config.kernel = NULL;
            }
          }
        }
        else if (EXIT_SUCCESS == result) {
          result = EXIT_FAILURE;
        }
      }
      else {
        result = EXIT_FAILURE;
      }
#  if defined(__DBCSR_ACC)
      c_dbcsr_timestop(&routine_handle);
#  endif
    }
    assert((NULL != config && NULL != config->kernel && 0 < config->wgsize) || EXIT_SUCCESS != result);
    if (EXIT_SUCCESS == result) {
      /* event is initialized such that it can be told apart whether the launch produced an event */
      cl_event event = NULL, *const perf_event = ((c_dbcsr_acc_opencl_timer_host == c_dbcsr_acc_opencl_config.timer ||
                                                    (0 <= c_dbcsr_acc_opencl_config.verbosity && 2 >= c_dbcsr_acc_opencl_config.verbosity))
                                                    ? NULL
                                                    : &event);
      const size_t work_size = config->wgsize * stack_size; /* one work-group per stack entry */
      const int typesize = OPENCL_LIBSMM_TYPESIZE(datatype);
#  if defined(OPENCL_LIBSMM_VALIDATE_TRANS)
      const int offset_stack_size = offset + stack_size;
      char *imat = NULL, *omat = NULL, *gold = NULL;
      void* scratch = NULL;
      int* stack = NULL;
      size_t data_size;
      /* validation: keep host-copies of the stack and of the data before the kernel runs */
      if (EXIT_SUCCESS == clGetMemObjectInfo(info_mdata.memory, CL_MEM_SIZE, sizeof(size_t), &data_size, NULL)) {
        const size_t scratch_size = (sizeof(int) * offset_stack_size) /*stack*/
                                    + data_size /*imat*/ + data_size /*omat*/ + (mn * typesize) /*gold*/
                                    + 3 * (LIBXSMM_ALIGNMENT - 1) /*alignments*/;
        scratch = libxsmm_aligned_scratch(scratch_size, LIBXSMM_ALIGNMENT);
        if (NULL != scratch) {
          stack = (int*)scratch;
          imat = (char*)LIBXSMM_UP2((uintptr_t)stack + sizeof(int) * offset_stack_size, LIBXSMM_ALIGNMENT);
          omat = (char*)LIBXSMM_UP2((uintptr_t)imat + data_size, LIBXSMM_ALIGNMENT);
          gold = (char*)LIBXSMM_UP2((uintptr_t)omat + data_size, LIBXSMM_ALIGNMENT);
          ACC_OPENCL_CHECK(c_dbcsr_acc_memcpy_d2h(dev_trs_stack, stack, sizeof(int) * offset_stack_size, stream),
            "transfer validation stack", result);
          ACC_OPENCL_CHECK(c_dbcsr_acc_memcpy_d2h(dev_data, imat, data_size, stream), "transfer validation input", result);
        }
        else result = EXIT_FAILURE;
      }
      else {
        result = EXIT_FAILURE;
      }
#  endif
      assert(!(OPENCL_LIBSMM_NLOCKS_TRANS & (OPENCL_LIBSMM_NLOCKS_TRANS - 1))); /* POT */
      { /* calling clSetKernelArg/clEnqueueNDRangeKernel must be consistent */
        static ACC_OPENCL_ATOMIC_LOCKTYPE locks[OPENCL_LIBSMM_NLOCKS_TRANS];
#  if (1 < OPENCL_LIBSMM_NLOCKS_TRANS)
        /* select the lock by hashing the kernel-handle */
        const unsigned int hash = libxsmm_hash(&config->kernel, sizeof(cl_kernel), 25071975 /*seed*/);
        const unsigned int lidx = LIBXSMM_MOD2(hash, OPENCL_LIBSMM_NLOCKS_TRANS);
        ACC_OPENCL_ATOMIC_LOCKTYPE* const lock = locks + lidx;
#  else
        ACC_OPENCL_ATOMIC_LOCKTYPE* const lock = locks;
#  endif
        ACC_OPENCL_ATOMIC_ACQUIRE(lock);
        ACC_OPENCL_CHECK(
          clSetKernelArg(config->kernel, 0, sizeof(int), &offset), "set offset argument of transpose kernel", result);
        ACC_OPENCL_CHECK(c_dbcsr_acc_opencl_set_kernel_ptr(config->kernel, 1, info_stack.memory),
          "set batch-list argument of transpose kernel", result);
        ACC_OPENCL_CHECK(c_dbcsr_acc_opencl_set_kernel_ptr(config->kernel, 2, info_mdata.memory),
          "set matrix-data argument of transpose kernel", result);
        ACC_OPENCL_CHECK(clEnqueueNDRangeKernel(str->queue, config->kernel, 1 /*work_dim*/, NULL /*offset*/, &work_size,
                           &config->wgsize, 0, NULL, perf_event),
          "launch transpose kernel", result);
        /* eventually update performance counters inside of locked region */
#  if !defined(OPENCL_LIBSMM_VALIDATE_TRANS)
        if (3 <= c_dbcsr_acc_opencl_config.verbosity || 0 > c_dbcsr_acc_opencl_config.verbosity) {
          if (NULL != perf_event) { /* device-side timing (event profiling) */
            cl_ulong begin = 0, end = 0;
            clWaitForEvents(1, perf_event);
            ACC_OPENCL_CHECK(clGetEventProfilingInfo(*perf_event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &begin, NULL),
              "query kernel start time", result);
            ACC_OPENCL_CHECK(clGetEventProfilingInfo(*perf_event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL),
              "query kernel end time", result);
            duration = 1E-9 * LIBXSMM_DELTA(begin, end); /* Nanoseconds->seconds */
          }
          else { /* host-side timing (includes everything since function entry) */
            clFinish(str->queue);
            duration = libxsmm_timer_duration(start, libxsmm_timer_tick()); /* seconds */
          }
          if (EXIT_SUCCESS == result) {
            const double membw = (1ULL * stack_size * (typesize * m * n)) / (duration * (1ULL << 30));
            LIBXSMM_STDIO_ACQUIRE();
            fprintf(stderr, "INFO ACC/LIBSMM: TRANS-kernel ");
            opencl_libsmm_write_trans_params(
              stderr, 1 /*only_key*/, &key, NULL /*config*/, NULL /*delim*/, NULL /*begin*/, NULL /*close*/);
            fprintf(stderr, "=");
            opencl_libsmm_write_trans_params(stderr, 1 /*only_key*/, &key, config, NULL /*delim*/, NULL /*begin*/, NULL /*close*/);
            fprintf(stderr, " ss=%i cur=%.1f GB/s dur=%.2g ms\n", stack_size, membw, 1E3 * duration);
            LIBXSMM_STDIO_RELEASE();
          }
        }
#  endif
        ACC_OPENCL_ATOMIC_RELEASE(lock);
      }
      if (NULL != event) { /* the launch handed out an event: drop the reference held by this function */
        const int result_event = clReleaseEvent(event);
        if (EXIT_SUCCESS == result) result = result_event;
      }
#  if defined(OPENCL_LIBSMM_VALIDATE_TRANS)
      ACC_OPENCL_CHECK(c_dbcsr_acc_memcpy_d2h(dev_data, omat, data_size, stream), "transfer validation test", result);
#  endif
#  if defined(OPENCL_LIBSMM_VALIDATE_TRANS)
      ACC_OPENCL_CHECK(c_dbcsr_acc_stream_sync(stream), "sync stream", result);
#  endif
#  if defined(OPENCL_LIBSMM_VALIDATE_TRANS)
      if (EXIT_SUCCESS == result) { /* compare every transposed matrix against a host-side reference */
        int i, j;
        LIBXSMM_STDIO_ACQUIRE();
        if (0 != c_dbcsr_acc_opencl_config.verbosity) {
          fprintf(stderr,
            "libsmm_acc_transpose("
            "offset=%i, size=%i, type=%s, m=%i, n=%i, max=%i, stream=%p)",
            offset, stack_size, dbcsr_type_real_8 == datatype ? "f64" : (dbcsr_type_real_4 == datatype ? "f32" : "unknown"), m, n,
            max_kernel_dim, stream);
        }
        for (i = offset; i < offset_stack_size; ++i) {
          const size_t index = stack[i];
          const char* const orig = imat + index * typesize;
          const char* const test = omat + index * typesize;
          assert((index * typesize) < data_size);
          memcpy(gold, orig, mn * typesize);
          libxsmm_itrans(gold, typesize, m, n, m, n);
          if (0 != memcmp(gold, test, mn * typesize)) {
            if (0 == c_dbcsr_acc_opencl_config.verbosity) {
              fprintf(stderr,
                "libsmm_acc_transpose("
                "offset=%i, size=%i, type=%s, m=%i, n=%i, max=%i, stream=%p)",
                offset, stack_size, dbcsr_type_real_8 == datatype ? "f64" : (dbcsr_type_real_4 == datatype ? "f32" : "unknown"), m,
                n, max_kernel_dim, stream);
            }
            fprintf(stderr, " => ERROR\n");
            if (3 <= c_dbcsr_acc_opencl_config.verbosity || 0 > c_dbcsr_acc_opencl_config.verbosity) {
              fprintf(stderr, "stackposition = %i (index=%llu)\n", i, (unsigned long long)index);
              opencl_libsmm_print_matrix(stderr, "orig = ", datatype, orig, m, n);
              opencl_libsmm_print_matrix(stderr, "gold = ", datatype, gold, n, m);
              opencl_libsmm_print_matrix(stderr, "test = ", datatype, test, n, m);
              fprintf(stderr, "\n");
            }
#    if defined(OPENCL_LIBSMM_VALIDATE_EXIT)
            exit(EXIT_FAILURE);
#    else
            result = EXIT_FAILURE;
            break;
#    endif
          }
          /* an index occurring twice in the stack is treated as an error */
          for (j = offset; j < i; ++j) {
            const size_t duplicate = stack[j];
            if (index == duplicate) {
              fprintf(stderr, " => ERROR\n");
#    if defined(OPENCL_LIBSMM_VALIDATE_EXIT)
              exit(EXIT_FAILURE);
#    else
              i = offset_stack_size; /* also leave the outer loop */
              result = EXIT_FAILURE;
              break;
#    endif
            }
          }
        }
        if (0 != c_dbcsr_acc_opencl_config.verbosity && EXIT_SUCCESS == result) {
          fprintf(stderr, " => OK\n");
        }
        LIBXSMM_STDIO_RELEASE();
      }
      libxsmm_free(scratch);
#  endif
    }
  }
  ACC_OPENCL_RETURN(result);
}


/* Decides whether a stack of matrix multiplications can be processed by this backend.
 *
 * def_mnk:        non-zero if the stack is homogeneous (all multiplications share the same shape).
 * datatype:       element type; accepted are dbcsr_type_real_8 and dbcsr_type_real_4
 *                 (subject to OPENCL_LIBSMM_F64 and OPENCL_LIBSMM_F32, respectively).
 * stack_size:     number of multiplications in the stack (must be positive).
 * m_max, n_max, k_max: shape of the multiplications (must be positive).
 * max_kernel_dim: upper bound for m_max and n_max (k_max is allowed to exceed it).
 *
 * Returns non-zero (true) if the stack is suitable, and zero (false) otherwise.
 * For a rejected stack, the reason is printed to stderr if verbosity is at least two (or negative).
 */
c_dbcsr_acc_bool_t libsmm_acc_process_suitable(
  c_dbcsr_acc_bool_t def_mnk, libsmm_acc_data_t datatype, int stack_size, int m_max, int n_max, int k_max, int max_kernel_dim) {
  c_dbcsr_acc_bool_t result = 0; /* false */
  if (0 < m_max && 0 < n_max && 0 < k_max && 0 < stack_size &&
      0 != def_mnk /*homogeneous*/
      /* allow k_max to exceed max_kernel_dim, TODO: BLAS for large kernels (m,n) */
      && m_max <= max_kernel_dim && n_max <= max_kernel_dim)
  {
    switch (datatype) {
#  if defined(OPENCL_LIBSMM_F64)
      case dbcsr_type_real_8: {
        result = 1; /* true */
      } break;
#  endif
#  if defined(OPENCL_LIBSMM_F32)
      case dbcsr_type_real_4: {
        result = 1; /* true */
      } break;
#  endif
      default: assert(/*false*/ 0 == result);
    }
  }
  if ((/*false*/ 0 == result) && (2 <= c_dbcsr_acc_opencl_config.verbosity || 0 > c_dbcsr_acc_opencl_config.verbosity)) {
    opencl_libsmm_smmkey_t key;
    opencl_libsmm_smm_t dummy;
    /* zero-fill the key first: members not assigned below (e.g., devuid) as well as
     * alignment gaps must not be indeterminate when the key is handed to the printer.
     */
    memset(&key, 0, sizeof(key));
    memset(&dummy, 0, sizeof(dummy)); /* mute warnings about potentially uninitialized data */
    key.type = datatype;
    key.m = m_max;
    key.n = n_max;
    key.k = k_max; /* initialize key */
    LIBXSMM_STDIO_ACQUIRE();
    fprintf(stderr, "INFO ACC/LIBSMM: SMM-kernel ");
    opencl_libsmm_write_smm_params(stderr, 1 /*only_key*/, &key, NULL /*config*/, NULL /*delim*/, NULL /*begin*/, NULL /*close*/);
    fprintf(stderr, "=");
    opencl_libsmm_write_smm_params(stderr, 1 /*only_key*/, &key, &dummy, NULL /*delim*/, NULL /*begin*/, NULL /*close*/);
    fprintf(stderr, " ss=%i", stack_size);
    /* name the reason: shape fits but stack is ignored/inhomogeneous, or shape exceeds the limit */
    if (m_max <= max_kernel_dim && n_max <= max_kernel_dim) {
      fprintf(stderr, 0 != def_mnk ? " is ignored\n" : " is inhomogeneous\n");
    }
    else fprintf(stderr, " is too large\n");
    LIBXSMM_STDIO_RELEASE();
  }
  return result;
}


int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack, int stack_size, libsmm_acc_data_t datatype,
  const void* dev_a_data, const void* dev_b_data, void* dev_c_data, int m_max, int n_max, int k_max, int max_kernel_dim,
  c_dbcsr_acc_bool_t def_mnk, void* stream, void* c_stream) {
  int result = EXIT_SUCCESS;
  const int nparams = 3;
  LIBXSMM_UNUSED(c_stream); /* TODO */
  assert(0 == stack_size || (NULL != dev_a_data && NULL != dev_b_data && NULL != dev_c_data));
  assert(0 == stack_size || (NULL != host_param_stack && NULL != dev_param_stack));
  assert(0 < nparams && 0 < max_kernel_dim && NULL != stream);
  assert(0 <= stack_size && 0 <= m_max && 0 <= n_max && 0 <= k_max);
  if (0 != libsmm_acc_process_suitable(def_mnk, datatype, stack_size, m_max, n_max, k_max, max_kernel_dim)) {
    c_dbcsr_acc_opencl_info_memptr_t info_stack, info_adata, info_bdata, info_cdata;
    opencl_libsmm_smmkey_t key;
#  if !defined(OPENCL_LIBSMM_VALIDATE_SMM)
    double duration;
    const libxsmm_timer_tickint start = libxsmm_timer_tick();
#  endif
    const c_dbcsr_acc_opencl_stream_t* const str = ACC_OPENCL_STREAM(stream);
    LIBXSMM_MEMZERO127(&key); /* potentially heterogeneous key-data */
    key.devuid = ((1 != c_dbcsr_acc_opencl_config.devmatch && ((unsigned int)-1) != c_dbcsr_acc_opencl_config.devmatch)
                    ? c_dbcsr_acc_opencl_config.devmatch
                    : c_dbcsr_acc_opencl_config.device.uid);
    key.type = datatype;
    key.m = m_max;
    key.n = n_max;
    key.k = k_max;
    result |= c_dbcsr_acc_opencl_info_devptr(&info_stack, dev_param_stack, sizeof(int), NULL /*amount*/, NULL /*offset*/);
    result |= c_dbcsr_acc_opencl_info_devptr(&info_adata, dev_a_data, 1 /*elsize*/, NULL /*amount*/, NULL /*offset*/);
    result |= c_dbcsr_acc_opencl_info_devptr(&info_bdata, dev_b_data, 1 /*elsize*/, NULL /*amount*/, NULL /*offset*/);
    result |= c_dbcsr_acc_opencl_info_devptr(&info_cdata, dev_c_data, 1 /*elsize*/, NULL /*amount*/, NULL /*offset*/);
    if (EXIT_SUCCESS == result) {
      static ACC_OPENCL_ATOMIC_LOCKTYPE locks[OPENCL_LIBSMM_NLOCKS_SMM];
      const char *const env_s = getenv("OPENCL_LIBSMM_SMM_S"), *const env_bs = getenv("OPENCL_LIBSMM_SMM_BS");
      const int s = ((NULL == env_s || '\0' == *env_s) ? OPENCL_LIBSMM_SMM_S : atoi(env_s));
      int kernel_idx = 0, bs = ((NULL == env_bs || '\0' == *env_bs) ? 0 : atoi(env_bs));
      opencl_libsmm_smm_t* config;
      ACC_OPENCL_ATOMIC_LOCKTYPE* lock = locks;
#  if (1 < OPENCL_LIBSMM_NLOCKS_SMM)
      assert(!(OPENCL_LIBSMM_NLOCKS_SMM & (OPENCL_LIBSMM_NLOCKS_SMM - 1))); /* POT */
      lock += LIBXSMM_MOD2(libxsmm_hash(&key, sizeof(key), 25071975 /*seed*/), OPENCL_LIBSMM_NLOCKS_SMM);
#  endif
      ACC_OPENCL_ATOMIC_ACQUIRE(lock); /* calling clSetKernelArg/clEnqueueNDRangeKernel must be consistent */
      config = (opencl_libsmm_smm_t*)libxsmm_xdispatch(&key, sizeof(key));
      if (0 >= bs) bs = ((NULL != config && 0 < config->bs) ? config->bs : OPENCL_LIBSMM_DEFAULT_BS);
      /* determine kernel-kind (mini-batch vs. mini-kernel) */
      if (1 == bs || 0 > s || (bs * s) > stack_size) kernel_idx = bs = 1;
      if (NULL == config || NULL == config->kernel[kernel_idx]) {
        char buffer[ACC_OPENCL_BUFFERSIZE], build_params[ACC_OPENCL_BUFFERSIZE], fname[ACC_OPENCL_MAXSTRLEN];
        int nchar = LIBXSMM_SNPRINTF(fname, sizeof(fname),
          /* kernel name are meant to be unambiguous (BLAS-typeprefix and kernelsize) */
          "x" OPENCL_LIBSMM_KERNELNAME_SMM "%ix%ix%i", m_max, n_max, k_max);
#  if defined(__DBCSR_ACC)
        int routine_handle;
        c_dbcsr_timeset(LIBSMM_ACC_PROCESS_ROUTINE_NAME_STRPTR, LIBSMM_ACC_PROCESS_ROUTINE_NAME_LENPTR, &routine_handle);
#  endif
        result = ((0 < nchar && (int)sizeof(fname) > nchar) ? EXIT_SUCCESS : EXIT_FAILURE);
        if (EXIT_SUCCESS == result) {
          c_dbcsr_acc_opencl_atomic_fp_t tkind = c_dbcsr_acc_opencl_atomic_fp_no;
          const char* tname = NULL;
          switch (datatype) {
            case dbcsr_type_real_8: {
              tkind = c_dbcsr_acc_opencl_atomic_fp_64;
              tname = "double";
              fname[0] = 'd';
            } break;
            case dbcsr_type_real_4: {
              tkind = c_dbcsr_acc_opencl_atomic_fp_32;
              tname = "float";
              fname[0] = 's';
            } break;
            default: assert(NULL == tname);
          }
          if (NULL != tname) {
            const char *extensions[] = {NULL, NULL}, *const env_devid = getenv("OPENCL_LIBSMM_SMM_DEVID");
            const unsigned int devuid = (NULL == env_devid || '\0' == *env_devid) ? c_dbcsr_acc_opencl_config.device.uid
                                                                                  : (unsigned int)strtoul(env_devid, NULL, 0);
            size_t nextensions = sizeof(extensions) / sizeof(*extensions), sgs = 0, wgsize_prf = 1;
            const char *const env_bm = getenv("OPENCL_LIBSMM_SMM_BM"), *const env_bn = getenv("OPENCL_LIBSMM_SMM_BN");
            const char *const env_bk = getenv("OPENCL_LIBSMM_SMM_BK"), *const env_ws = getenv("OPENCL_LIBSMM_SMM_WS");
            const char *const env_wg = getenv("OPENCL_LIBSMM_SMM_WG"), *const env_lu = getenv("OPENCL_LIBSMM_SMM_LU");
            const char *const env_nz = getenv("OPENCL_LIBSMM_SMM_NZ"), *const env_al = getenv("OPENCL_LIBSMM_SMM_AL");
            const char *const env_tb = getenv("OPENCL_LIBSMM_SMM_TB"), *const env_tc = getenv("OPENCL_LIBSMM_SMM_TC");
            const char *const env_ap = getenv("OPENCL_LIBSMM_SMM_AP"), *const env_aa = getenv("OPENCL_LIBSMM_SMM_AA");
            const char *const env_ab = getenv("OPENCL_LIBSMM_SMM_AB"), *const env_ac = getenv("OPENCL_LIBSMM_SMM_AC");
            const char *const env_xf = getenv("OPENCL_LIBSMM_SMM_XF"), *const env_cl = getenv("OPENCL_LIBSMM_SMM_BUILDOPTS");
            const char* const intel_xf = "-cl-intel-256-GRF-per-thread";
            const int blockn = ((NULL == env_bn || '\0' == *env_bn) ? 0 : atoi(env_bn));
            const int blockk = ((NULL == env_bk || '\0' == *env_bk) ? 0 : atoi(env_bk));
            const int wgmin = ((NULL == env_ws || '\0' == *env_ws) ? 0 : atoi(env_ws));
            const int default_aa = (((0x0bd0 > devuid || 0x0bdb < devuid)) ? ((k_max % OPENCL_LIBSMM_VMIN) ? 1 : 2) : 0);
            const int default_ab = (((0x0bd0 > devuid || 0x0bdb < devuid) && 0x020a != devuid) ? 3 : 0), default_ac = 0;
            const int default_bk = (((0x0bd0 > devuid || 0x0bdb < devuid) && 0x020a != devuid)
                                      ? (0 == kernel_idx ? LIBXSMM_MIN(OPENCL_LIBSMM_DEFAULT_BK, m_max)
                                                         : LIBXSMM_MIN(OPENCL_LIBSMM_VMIN, m_max))
                                      : 1);
            const int default_wg = (((0x0bd0 > devuid || 0x0bdb < devuid)) ? (0 == kernel_idx ? 0 : -2) : -1);
            const int default_lu = (0 != c_dbcsr_acc_opencl_config.device.intel ? -1 : 0);
            int defaults, blockm, nbm, nbn;
            opencl_libsmm_smm_t new_config;
            if (NULL == config) {
              memset(&new_config, 0, sizeof(new_config));
            }
            else { /* preserve kernels, performance counters, etc. */
              memcpy(&new_config, config, sizeof(opencl_libsmm_smm_t));
            }
            if (NULL == env_xf || '\0' == *env_xf) {
              if (0 != c_dbcsr_acc_opencl_config.device.intel && CL_DEVICE_TYPE_GPU == c_dbcsr_acc_opencl_config.device.type &&
                  NULL != env_cl && NULL != strstr(env_cl, intel_xf))
              {
                new_config.flags = 1;
              }
            }
            else new_config.flags = atoi(env_xf);
            defaults = ((NULL == config || 0 != kernel_idx || (NULL != config && new_config.flags != config->flags)) ? 1 : 0);
            new_config.lu = LIBXSMM_MAX(-2, (NULL == env_lu || '\0' == *env_lu) ? (0 != defaults ? default_lu : config->lu)
                                                                                : atoi(env_lu)); /* populate only lower bound */
            blockm = ((NULL == env_bm || '\0' == *env_bm || 1 < new_config.lu) /* 1<LU ignores BM */
                        ? (1 >= new_config.lu ? 0 : LIBXSMM_UP(m_max / new_config.lu, OPENCL_LIBSMM_VMIN))
                        : atoi(env_bm));
            /* two defaults for new_config parameters: 1st - regular, 2nd - BS=1 kernel */
            new_config.bm = (0 >= blockm ? (0 == kernel_idx ? (0 != defaults ? LIBXSMM_MIN(OPENCL_LIBSMM_DEFAULT_BM, m_max)
                                                                             : LIBXSMM_CLMP(config->bm, 1, m_max))
                                                            : LIBXSMM_MIN(OPENCL_LIBSMM_DEFAULT_BM, m_max))
                                         : LIBXSMM_MIN(blockm, m_max));
            new_config.bn = (0 >= blockn ? (0 == kernel_idx ? (0 != defaults ? LIBXSMM_MIN(OPENCL_LIBSMM_DEFAULT_BN, n_max)
                                                                             : LIBXSMM_CLMP(config->bn, 1, n_max))
                                                            : LIBXSMM_MIN(OPENCL_LIBSMM_DEFAULT_BN, n_max))
                                         : LIBXSMM_MIN(blockn, n_max));
            new_config.bk = (0 >= blockk ? (0 != defaults ? default_bk : LIBXSMM_CLMP(config->bk, 1, m_max))
                                         : LIBXSMM_MIN(blockk, m_max));
            new_config.ws = (0 >= wgmin ? (0 == kernel_idx ? (0 != defaults ? LIBXSMM_MAX(m_max, n_max)
                                                                            : LIBXSMM_CLMP(config->ws, 1, n_max * m_max))
                                                           : LIBXSMM_MAX(m_max, n_max))
                                        : LIBXSMM_MIN(wgmin, n_max * m_max));
            new_config.wg = LIBXSMM_CLMP(
              (NULL == env_wg || '\0' == *env_wg) ? (0 != defaults ? default_wg : config->wg) : atoi(env_wg), -2, 2);
            new_config.nz = LIBXSMM_CLMP(
              (NULL == env_nz || '\0' == *env_nz) ? (0 != defaults ? /*default*/ 0 : config->nz) : atoi(env_nz), 0, 1);
            new_config.al = LIBXSMM_CLMP(/* bug: AL=1 */
              (NULL == env_al || '\0' == *env_al)
                ? (0 == (32 & c_dbcsr_acc_opencl_config.wa) ? (0 != defaults ? 0 : config->al) : 0)
                : atoi(env_al),
              0, 1);
            new_config.tb = LIBXSMM_CLMP(
              (NULL == env_tb || '\0' == *env_tb) ? (0 != defaults ? /*default*/ 0 : config->tb) : atoi(env_tb), 0, 1);
            new_config.tc = LIBXSMM_CLMP(
              (NULL == env_tc || '\0' == *env_tc) ? (0 != defaults ? /*default*/ 1 : config->tc) : atoi(env_tc), 0, 1);
            new_config.ap = LIBXSMM_CLMP(
              (NULL == env_ap || '\0' == *env_ap) ? (0 != defaults ? /*default*/ 0 : config->ap) : atoi(env_ap), 0, 1);
            new_config.aa = LIBXSMM_CLMP(/* bug: AA=2 XF=1 */
              (NULL == env_aa || '\0' == *env_aa) ? (0 != defaults ? default_aa : config->aa) : atoi(env_aa), 0,
              (0 == (64 & c_dbcsr_acc_opencl_config.wa) || 0 == new_config.flags) ? 2 : 1);
            new_config.ab = LIBXSMM_CLMP(
              (NULL == env_ab || '\0' == *env_ab) ? (0 != defaults ? default_ab : config->ab) : atoi(env_ab), 0, 2);
            new_config.ac = LIBXSMM_CLMP(
              (NULL == env_ac || '\0' == *env_ac) ? (0 != defaults ? default_ac : config->ac) : atoi(env_ac), 0, 1);
            if (0 >= new_config.s) new_config.s = stack_size;
            if (0 == kernel_idx || 1 >= new_config.bs) new_config.bs = bs;
            nbm = (m_max + new_config.bm - 1) / new_config.bm;
            nbn = (n_max + new_config.bn - 1) / new_config.bn;
            new_config.wgsize[kernel_idx] = LIBXSMM_MAX(nbm * nbn, new_config.ws);
            if (0 != new_config.wg) {
              if (0 != c_dbcsr_acc_opencl_config.device.wgsize[2]) { /* subgroups supported */
                if (new_config.wgsize[kernel_idx] <= c_dbcsr_acc_opencl_config.device.wgsize[2]) {
                  sgs = c_dbcsr_acc_opencl_config.device.wgsize[2];
                }
                else if (new_config.wgsize[kernel_idx] <= c_dbcsr_acc_opencl_config.device.wgsize[1]) {
                  sgs = c_dbcsr_acc_opencl_config.device.wgsize[1];
                }
              }
              wgsize_prf = LIBXSMM_UP(new_config.wgsize[kernel_idx], 0 != sgs ? sgs : c_dbcsr_acc_opencl_config.device.wgsize[1]);
            }
            else { /* cover exactly */
              wgsize_prf = new_config.wgsize[kernel_idx];
            }
            if (2 <= new_config.wg) wgsize_prf = LIBXSMM_UP2POT(wgsize_prf);
            if (wgsize_prf < (2 * new_config.wgsize[kernel_idx])) new_config.wgsize[kernel_idx] = wgsize_prf; /* limit */
            assert(1 <= bs && 0 < new_config.wgsize[kernel_idx] && 0 < wgsize_prf);
            /* ensure minimum requested WG-size */
            while ((nbm * nbn) < new_config.ws && (nbm < m_max || nbn < n_max)) {
              if (nbn < n_max) ++nbn;
              else if (nbm < m_max) ++nbm;
            }
            if ((nbm * nbn) < new_config.ws) {
              new_config.bn = (n_max + nbn - 1) / nbn;
              new_config.bm = (m_max + nbm - 1) / nbm;
              new_config.wgsize[kernel_idx] = (2 > new_config.wg ? (nbm * nbn) : ((int)LIBXSMM_UP2POT(nbm * nbn)));
            }
            else { /* reset */
              nbm = (m_max + new_config.bm - 1) / new_config.bm;
              nbn = (n_max + new_config.bn - 1) / new_config.bn;
            }
            /* limit WG-size to maximum WG-size */
            while (c_dbcsr_acc_opencl_config.device.wgsize[0] < new_config.wgsize[kernel_idx] &&
                   (new_config.bm < m_max || new_config.bn < n_max))
            {
              if (new_config.bn < n_max) {
                ++new_config.bn;
                nbn = (n_max + new_config.bn - 1) / new_config.bn;
              }
              else if (new_config.bm < m_max) {
                ++new_config.bm;
                nbm = (m_max + new_config.bm - 1) / new_config.bm;
              }
              new_config.wgsize[kernel_idx] = (2 > new_config.wg ? (nbm * nbn) : ((int)LIBXSMM_UP2POT(nbm * nbn)));
            }
            if (new_config.wgsize[kernel_idx] <= c_dbcsr_acc_opencl_config.device.wgsize[0]) { /* SMM can be handled by device */
              const char* const cmem = (EXIT_SUCCESS != opencl_libsmm_use_cmem(c_dbcsr_acc_opencl_config.device.id) ? "global"
                                                                                                                    : "constant");
              const char* const env_nrepeat = getenv("SMM_NREPEAT");
              const int typesize = OPENCL_LIBSMM_TYPESIZE(datatype);
              const int slm_a = (1 != new_config.aa ? 0 : (LIBXSMM_ISPOT(k_max * typesize) + 1));
              const int slm_b = (1 != new_config.ab ? 0 : (LIBXSMM_ISPOT(k_max * typesize) + 1));
              const int slm_c = (1 != new_config.ac ? 0 : (LIBXSMM_ISPOT(m_max * typesize) + 1));
              /* compose build parameters and flags */
              nchar = LIBXSMM_SNPRINTF(build_params, sizeof(build_params),
                "-DT=%s -DGPU=%u -DGLOBAL=%s -DSWG=%i -DSGS=%i -DFN=%s -DREPEAT=%i -DLU=%i "
                "-DSM=%i -DSN=%i -DSK=%i -DBS=%i -DVL=%i %s -DBM=%i -DBN=%i -DBK=%i "
                "%s %s %s %s %s %s %s %s ", /* space! */
                tname, CL_DEVICE_TYPE_GPU == c_dbcsr_acc_opencl_config.device.type, cmem, (int)new_config.wgsize[kernel_idx],
                (int)sgs, fname, NULL == env_nrepeat ? 1 : atoi(env_nrepeat), new_config.lu, m_max, n_max, k_max, bs,
                OPENCL_LIBSMM_VMIN, bs == new_config.bs ? "-DBSC" : "", new_config.bm, new_config.bn, new_config.bk,
                0 == new_config.tb ? "" : "-DTRACK_B", 0 != new_config.tc ? "-DTRACK_C" : "",
                0 == new_config.nz ? "" : "-DATOMIC_INC_NZ", 0 == new_config.al ? "" : "-DAL", 0 == new_config.ap ? "" : "-DSLM_P",
                0 == new_config.aa ? "" : (1 == slm_a ? "-DSLM_A=1" : (0 != slm_a ? "-DSLM_A=2" : "-DREG_A")),
                0 == new_config.ab ? "" : (1 == slm_b ? "-DSLM_B=1" : (0 != slm_b ? "-DSLM_B=2" : "-DREG_B")),
                0 == new_config.ac ? "" : (1 == slm_c ? "-DSLM_C=1" : "-DSLM_C=2"));
              /* apply support for FP-atomics */
              if (0 < nchar && (int)sizeof(build_params) > nchar) {
                nchar = c_dbcsr_acc_opencl_flags_atomics(&c_dbcsr_acc_opencl_config.device, tkind, extensions, &nextensions,
                  build_params + nchar, sizeof(build_params) - nchar);
              }
              else result = EXIT_FAILURE;
              if (0 < nchar && (int)sizeof(build_params) > nchar) {
                const char* const cl_debug = ((0 != c_dbcsr_acc_opencl_config.debug &&
                                                0 != c_dbcsr_acc_opencl_config.device.intel &&
                                                CL_DEVICE_TYPE_CPU != c_dbcsr_acc_opencl_config.device.type)
                                                ? "-gline-tables-only"
                                                : "");
                nchar = LIBXSMM_SNPRINTF(buffer, sizeof(buffer), "%s %s %s %s",
                  (0 == new_config.flags || 0 == c_dbcsr_acc_opencl_config.device.intel ||
                    CL_DEVICE_TYPE_GPU != c_dbcsr_acc_opencl_config.device.type)
                    ? ""
                    : intel_xf,
                  cl_debug, 0 == c_dbcsr_acc_opencl_config.debug ? "-cl-fast-relaxed-math -cl-denorms-are-zero" : "",
                  NULL == env_cl ? "" : env_cl);
                if (0 >= nchar || (int)sizeof(buffer) <= nchar) result = EXIT_FAILURE;
              }
              else result = EXIT_FAILURE;
            }
            else { /* matrix-size causes too large WG-size */
              result = EXIT_FAILURE;
            }
            if (EXIT_SUCCESS == result) {
              const char* const env_kernel = getenv("OPENCL_LIBSMM_SMM_KERNEL");
              result = c_dbcsr_acc_opencl_kernel(NULL == env_kernel ? 0 : 1,
                NULL == env_kernel ? OPENCL_KERNELS_SOURCE_MULTIPLY : env_kernel, fname, build_params, buffer, NULL /*cl_try*/,
                NULL /*cl_try_ok*/, extensions, nextensions, new_config.kernel + kernel_idx);
              if (EXIT_SUCCESS == result) {
                size_t wgsize_max_kernel = c_dbcsr_acc_opencl_config.device.wgsize[0];
                result = clGetKernelWorkGroupInfo(new_config.kernel[kernel_idx], c_dbcsr_acc_opencl_config.device.id,
                  CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wgsize_max_kernel, NULL);
                if (EXIT_SUCCESS == result) {
                  assert(0 < new_config.wgsize[kernel_idx] && 0 < wgsize_max_kernel);
                  assert(wgsize_max_kernel <= c_dbcsr_acc_opencl_config.device.wgsize[0]);
                  if (new_config.wgsize[kernel_idx] <= wgsize_max_kernel) { /* check planned WG-size vs kernel-specific WG-size */
                    if (NULL == config || NULL == config->kernel[kernel_idx]) {
                      config = (opencl_libsmm_smm_t*)libxsmm_xregister(&key, sizeof(key), sizeof(new_config), &new_config);
                    }
                    if (NULL != config) {
#  if !defined(OPENCL_LIBSMM_VALIDATE_SMM)
                      if (2 <= c_dbcsr_acc_opencl_config.verbosity || 0 > c_dbcsr_acc_opencl_config.verbosity) {
                        LIBXSMM_STDIO_ACQUIRE();
                        duration = libxsmm_timer_duration(start, libxsmm_timer_tick());
                        fprintf(stderr, "INFO ACC/LIBSMM: SMM-kernel ");
                        opencl_libsmm_write_smm_params(
                          stderr, 0 /*only_key*/, &key, NULL /*config*/, NULL /*delim*/, NULL /*begin*/, NULL /*close*/);
                        fprintf(stderr, "=");
                        opencl_libsmm_write_smm_params(
                          stderr, 0 /*only_key*/, &key, &new_config, NULL /*delim*/, NULL /*begin*/, NULL /*close*/);
                        fprintf(stderr, " gen=%.1f ms\n", 1E3 * duration);
                        LIBXSMM_STDIO_RELEASE();
                      }
#  endif
                    }
                    /* failed to register config */
                    else result = EXIT_FAILURE;
                  }
                  else {
                    if (0 != c_dbcsr_acc_opencl_config.verbosity) {
                      fprintf(stderr, "ERROR LIBSMM: tile-size causes too large WG-size (min(%u,%u) < %u)!\n",
                        (unsigned int)wgsize_max_kernel, (unsigned int)c_dbcsr_acc_opencl_config.device.wgsize[0],
                        (unsigned int)new_config.wgsize[kernel_idx]);
                    }
                    result = EXIT_FAILURE; /* tile-size causes too large WG-size */
                  }
                }
              }
#  if defined(NDEBUG)
              else if (2 <= c_dbcsr_acc_opencl_config.verbosity || 0 > c_dbcsr_acc_opencl_config.verbosity) {
                LIBXSMM_STDIO_ACQUIRE();
                fprintf(stderr, "WARNING: SMM-kernel ");
                opencl_libsmm_write_smm_params(
                  stderr, 0 /*only_key*/, &key, NULL /*config*/, NULL /*delim*/, NULL /*begin*/, NULL /*close*/);
                fprintf(stderr, "=");
                opencl_libsmm_write_smm_params(
                  stderr, 0 /*only_key*/, &key, &new_config, NULL /*delim*/, NULL /*begin*/, NULL /*close*/);
                fprintf(stderr, " failed to compile!\n");
                LIBXSMM_STDIO_RELEASE();
              }
#  endif
            }
          }
          /* insufficient device capabilities */
          else result = EXIT_FAILURE;
        }
        /* remove configuration from registry to avoid infinitely retrying code generation */
        if (EXIT_SUCCESS != result && NULL != config) {
          libxsmm_xrelease(&key, sizeof(key));
        }
#  if defined(__DBCSR_ACC)
        c_dbcsr_timestop(&routine_handle);
#  endif
      }
      assert(EXIT_SUCCESS != result || (NULL != config && NULL != config->kernel[kernel_idx]));
      assert(EXIT_SUCCESS != result || (1 <= config->bm && config->bm <= m_max));
      assert(EXIT_SUCCESS != result || (1 <= config->bn && config->bn <= n_max));
      assert(EXIT_SUCCESS != result || (1 <= config->bk && config->bk <= m_max));
      assert(EXIT_SUCCESS != result || (1 <= config->ws && config->ws <= (m_max * n_max)));
      assert(EXIT_SUCCESS != result || (-2 <= config->wg && 2 >= config->wg));
      assert(EXIT_SUCCESS != result || (-2 <= config->lu /*&& 2 >= config->lu*/));
      assert(EXIT_SUCCESS != result || (0 <= config->nz && 1 >= config->nz));
      assert(EXIT_SUCCESS != result || (0 <= config->al && 1 >= config->al));
      assert(EXIT_SUCCESS != result || (0 <= config->tb && 1 >= config->tb));
      assert(EXIT_SUCCESS != result || (0 <= config->tc && 1 >= config->tc));
      assert(EXIT_SUCCESS != result || (0 <= config->ap && 1 >= config->ap));
      assert(EXIT_SUCCESS != result || (0 <= config->aa && 2 >= config->aa));
      assert(EXIT_SUCCESS != result || (0 <= config->ab && 2 >= config->ab));
      assert(EXIT_SUCCESS != result || (0 <= config->ac && 1 >= config->ac));
      assert(EXIT_SUCCESS != result || (1 <= config->wgsize[kernel_idx]));
      assert(EXIT_SUCCESS != result || (1 <= config->s && 1 <= config->bs));
      if (EXIT_SUCCESS == result) {
        cl_event event, *const perf_event =
                          ((c_dbcsr_acc_opencl_timer_host == c_dbcsr_acc_opencl_config.timer ||
                             (0 <= c_dbcsr_acc_opencl_config.verbosity && 2 >= c_dbcsr_acc_opencl_config.verbosity))
                              ? NULL
                              : &event);
        size_t work_size;
#  if defined(OPENCL_LIBSMM_VALIDATE_SMM)
        /* validate result (implies readback from device and performance penalty) */
        int* pinp = NULL;
        char *ainp = NULL, *binp = NULL, *test = NULL, *gold = NULL, *btrn = NULL;
        const libxsmm_datatype precision =
          (dbcsr_type_real_8 == datatype ? LIBXSMM_DATATYPE_F64
                                         : (dbcsr_type_real_4 == datatype ? LIBXSMM_DATATYPE_F32 : LIBXSMM_DATATYPE_UNSUPPORTED));
        const int typesize = OPENCL_LIBSMM_TYPESIZE(datatype);
        libxsmm_xmmfunction kernel_cpu = {NULL};
        size_t psize, asize, bsize, csize;
        void* scratch = NULL;
        if (EXIT_SUCCESS == clGetMemObjectInfo(info_stack.memory, CL_MEM_SIZE, sizeof(size_t), &psize, NULL) &&
            EXIT_SUCCESS == clGetMemObjectInfo(info_adata.memory, CL_MEM_SIZE, sizeof(size_t), &asize, NULL) &&
            EXIT_SUCCESS == clGetMemObjectInfo(info_bdata.memory, CL_MEM_SIZE, sizeof(size_t), &bsize, NULL) &&
            EXIT_SUCCESS == clGetMemObjectInfo(info_cdata.memory, CL_MEM_SIZE, sizeof(size_t), &csize, NULL))
        {
          libxsmm_descriptor_blob blob;
          libxsmm_gemm_descriptor* const desc = OPENCL_LIBSMM_DESCINIT(
            &blob, precision, m_max, n_max, k_max, m_max, k_max, m_max, LIBXSMM_GEMM_FLAG_NONE, LIBXSMM_PREFETCH_NONE);
          const size_t scratch_size = psize + asize + bsize + csize + csize + k_max * n_max * typesize +
                                      5 * (LIBXSMM_ALIGNMENT - 1) /*alignments*/;
          scratch = libxsmm_aligned_scratch(scratch_size, LIBXSMM_ALIGNMENT);
          if (NULL != desc && NULL != scratch) {
            pinp = (int*)scratch;
            ainp = (char*)LIBXSMM_UP2((uintptr_t)pinp + psize, LIBXSMM_ALIGNMENT);
            binp = (char*)LIBXSMM_UP2((uintptr_t)ainp + asize, LIBXSMM_ALIGNMENT);
            test = (char*)LIBXSMM_UP2((uintptr_t)binp + bsize, LIBXSMM_ALIGNMENT);
            gold = (char*)LIBXSMM_UP2((uintptr_t)test + csize, LIBXSMM_ALIGNMENT);
            btrn = (char*)LIBXSMM_UP2((uintptr_t)gold + csize, LIBXSMM_ALIGNMENT);
            ACC_OPENCL_CHECK(
              c_dbcsr_acc_memcpy_d2h(dev_param_stack, pinp, psize, stream), "transfer validation param-data", result);
            ACC_OPENCL_CHECK(c_dbcsr_acc_memcpy_d2h(dev_a_data, ainp, asize, stream), "transfer validation a-data", result);
            ACC_OPENCL_CHECK(c_dbcsr_acc_memcpy_d2h(dev_b_data, binp, bsize, stream), "transfer validation b-data", result);
            ACC_OPENCL_CHECK(c_dbcsr_acc_memcpy_d2h(dev_c_data, gold, csize, stream), "transfer validation c-data", result);
            kernel_cpu = libxsmm_xmmdispatch(desc);
            assert(NULL != kernel_cpu.xmm);
          }
          else result = EXIT_FAILURE;
        }
        else result = EXIT_FAILURE;
#  endif
        /* scale intra-kernel batchsize according to stacksize */
        if (0 == kernel_idx && 1 < config->bs && stack_size < config->s) {
#  if defined(OPENCL_LIBSMM_BS_MIN)
          const int config_bs = LIBXSMM_MAX(config->bs, OPENCL_LIBSMM_BS_MIN);
#  else
          const int config_bs = config->bs;
#  endif
          bs = (stack_size * config_bs + config->s - 1) / (config->s - 1);
          if (config->bs < bs) bs = config->bs;
        }
        /* adjust launchsize according to intra-kernel batchsize */
        work_size = ((stack_size + bs - 1) / bs) * config->wgsize[kernel_idx];
        /* calling clSetKernelArg/clEnqueueNDRangeKernel must be consistent */
        ACC_OPENCL_CHECK(c_dbcsr_acc_opencl_set_kernel_ptr(config->kernel[kernel_idx], 0, info_cdata.memory),
          "set C-matrix argument of SMM-kernel", result);
        ACC_OPENCL_CHECK(c_dbcsr_acc_opencl_set_kernel_ptr(config->kernel[kernel_idx], 1, info_adata.memory),
          "set A-matrix argument of SMM-kernel", result);
        ACC_OPENCL_CHECK(c_dbcsr_acc_opencl_set_kernel_ptr(config->kernel[kernel_idx], 2, info_bdata.memory),
          "set B-matrix argument of SMM-kernel", result);
        ACC_OPENCL_CHECK(c_dbcsr_acc_opencl_set_kernel_ptr(config->kernel[kernel_idx], 3, info_stack.memory),
          "set batch-list argument of SMM-kernel", result);
        if (0 == kernel_idx) {
          assert(bs <= config->bs);
          ACC_OPENCL_CHECK(clSetKernelArg(config->kernel[kernel_idx], 4, sizeof(int), &stack_size),
            "set stacksize argument of SMM-kernel", result);
          ACC_OPENCL_CHECK(
            clSetKernelArg(config->kernel[kernel_idx], 5, sizeof(int), &bs), "set minibatch argument of SMM-kernel", result);
        }
        ACC_OPENCL_CHECK(clEnqueueNDRangeKernel(str->queue, config->kernel[kernel_idx], 1 /*work_dim*/, NULL /*offset*/, &work_size,
                           config->wgsize + kernel_idx, 0, NULL, perf_event),
          "launch SMM-kernel", result);
        /* eventually update performance counters inside of locked region */
#  if !defined(OPENCL_LIBSMM_VALIDATE_SMM)
        if (3 <= c_dbcsr_acc_opencl_config.verbosity || 0 > c_dbcsr_acc_opencl_config.verbosity) {
          if (NULL != perf_event) {
            cl_ulong begin = 0, end = 0;
            clWaitForEvents(1, perf_event);
            ACC_OPENCL_CHECK(clGetEventProfilingInfo(*perf_event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &begin, NULL),
              "query kernel start time", result);
            ACC_OPENCL_CHECK(clGetEventProfilingInfo(*perf_event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL),
              "query kernel end time", result);
            duration = 1E-9 * LIBXSMM_DELTA(begin, end); /* Nanoseconds->seconds */
          }
          else {
            clFinish(str->queue);
            duration = libxsmm_timer_duration(start, libxsmm_timer_tick()); /* seconds */
          }
          if (EXIT_SUCCESS == result) {
            const double gflops = 1E-9 * (2ULL * m_max * n_max * k_max * stack_size) / duration;
            LIBXSMM_STDIO_ACQUIRE();
            fprintf(stderr, "INFO ACC/LIBSMM: SMM-kernel ");
            opencl_libsmm_write_smm_params(
              stderr, 1 /*only_key*/, &key, NULL /*config*/, NULL /*delim*/, NULL /*begin*/, NULL /*close*/);
            fprintf(stderr, "=");
            opencl_libsmm_write_smm_params(stderr, 1 /*only_key*/, &key, config, NULL /*delim*/, NULL /*begin*/, NULL /*close*/);
            fprintf(stderr, " ss=%i cur=%.1f GFLOPS/s dur=%.2g ms\n", stack_size, gflops, 1E3 * duration);
            LIBXSMM_STDIO_RELEASE();
          }
        }
#  endif
#  if defined(OPENCL_LIBSMM_VALIDATE_SMM)
        ACC_OPENCL_CHECK(c_dbcsr_acc_memcpy_d2h(dev_c_data, test, csize, stream), "transfer validation test", result);
        ACC_OPENCL_CHECK(c_dbcsr_acc_stream_sync(stream), "sync stream", result);
        if (EXIT_SUCCESS == result) {
          const char* const env_tol = getenv("OPENCL_LIBSMM_SMM_TOLERANCE");
          const double tolerance = ((NULL == env_tol || '\0' == *env_tol) ? 1E-3 : atof(env_tol));
          const int* const params = pinp + (4 <= nparams ? (nparams - 4) : 0);
          size_t i;
          LIBXSMM_STDIO_ACQUIRE();
          if (0 != c_dbcsr_acc_opencl_config.verbosity) {
            fprintf(stderr, "libsmm_acc_process(size=%i, type=%s, m=%i, n=%i, k=%i, max=%i, stream=%p)", stack_size,
              dbcsr_type_real_8 == datatype ? "f64" : (dbcsr_type_real_4 == datatype ? "f32" : "unknown"), m_max, n_max, k_max,
              max_kernel_dim, stream);
          }
          for (i = 0; i < ((size_t)stack_size * nparams); i += nparams) {
            const size_t ia = (size_t)(params[i + 0] - 1) * typesize;
            const size_t ib = (size_t)(params[i + 1] - 1) * typesize;
            const size_t ic = (size_t)(params[i + 2] - 1) * typesize;
            assert(ia < asize && ib < bsize && ic < csize);
            libxsmm_otrans(btrn, binp + ib, typesize, n_max, k_max, n_max, k_max);
            kernel_cpu.xmm(ainp + ia, btrn, gold + ic);
          }
          /* some result may be validated multiple times in case of duplicated c-indexes */
          for (i = 0; i < ((size_t)stack_size * nparams); i += nparams) {
            const size_t ic = (size_t)(params[i + 2] - 1) * typesize;
            double epsilon = 0;
            libxsmm_matdiff_info diff;
            libxsmm_matdiff(
              &diff, (libxsmm_datatype)precision, m_max, n_max, gold + ic, test + ic, &m_max /*ldref*/, &m_max /*ldtst*/);
#    if LIBXSMM_VERSION4(1, 17, 0, 0) < LIBXSMM_VERSION_NUMBER
            epsilon = libxsmm_matdiff_epsilon(&diff);
#    else
            epsilon = diff.normf_rel;
#    endif
            if (tolerance < epsilon) {
              if (0 == c_dbcsr_acc_opencl_config.verbosity) {
                fprintf(stderr, "libsmm_acc_process(size=%i, type=%s, m=%i, n=%i, k=%i, max=%i, stream=%p)", stack_size,
                  dbcsr_type_real_8 == datatype ? "f64" : (dbcsr_type_real_4 == datatype ? "f32" : "unknown"), m_max, n_max, k_max,
                  max_kernel_dim, stream);
              }
#    if LIBXSMM_VERSION4(1, 17, 0, 0) < LIBXSMM_VERSION_NUMBER
              fprintf(stderr, " => ERROR diff=%g (%g != %g)\n", diff.linf_abs, diff.v_ref, diff.v_tst);
#    else
              fprintf(stderr, " => ERROR diff=%g\n", diff.linf_abs);
#    endif
              if (3 <= c_dbcsr_acc_opencl_config.verbosity || 0 > c_dbcsr_acc_opencl_config.verbosity) {
                fprintf(stderr, "stackposition = %llu (index=%llu)\n", (unsigned long long)i, (unsigned long long)ic);
                opencl_libsmm_print_matrix(stderr, "gold = ", datatype, gold + ic, m_max, n_max);
                opencl_libsmm_print_matrix(stderr, "test = ", datatype, test + ic, m_max, n_max);
                fprintf(stderr, "\n");
              }
#    if defined(OPENCL_LIBSMM_VALIDATE_EXIT)
              exit(EXIT_FAILURE);
#    else
              result = EXIT_FAILURE;
              break;
#    endif
            }
          }
          if (0 != c_dbcsr_acc_opencl_config.verbosity && EXIT_SUCCESS == result) {
            fprintf(stderr, " => OK\n");
          }
          LIBXSMM_STDIO_RELEASE();
        }
        libxsmm_free(scratch);
#  elif defined(NDEBUG)
        LIBXSMM_UNUSED(nparams);
#  endif
#  if defined(NDEBUG)
        LIBXSMM_UNUSED(host_param_stack);
#  endif
      }
      ACC_OPENCL_ATOMIC_RELEASE(lock);
    }
  }
  else if (0 < stack_size) { /* inhomogeneous, large kernel, or unsupported datatype */
    return -1; /* TODO: document result code to trigger host-fallback */
  }
  ACC_OPENCL_RETURN(result);
}


/**
 * Stub for the norm calculation: none of the arguments is read or written,
 * and the function unconditionally returns -1.
 * NOTE(review): -1 presumably tells the caller that the request was not handled
 * by this backend - confirm against the interface declaring this function.
 */
int c_calculate_norms(const double* mat, int nblks, const int* offsets, const int* nelems, float* norms, void* stream_ptr) {
  const int result = -1; /* fixed result code, no work is performed */
  /* mark every parameter as intentionally unused */
  LIBXSMM_UNUSED(stream_ptr);
  LIBXSMM_UNUSED(norms);
  LIBXSMM_UNUSED(nelems);
  LIBXSMM_UNUSED(offsets);
  LIBXSMM_UNUSED(nblks);
  LIBXSMM_UNUSED(mat);
  return result;
}

#  if defined(__cplusplus)
}
#  endif

#endif /*__OPENCL*/
