/*
 * NOTE(review): this file appears to be damaged and should be restored from
 * its original before use.
 *   - All of the code sits on one physical line, so the #define directives
 *     trail the #include directive instead of starting their own lines, and
 *     the trailing line comment ("macro kernel: ...") swallows the
 *     macro_kernel_gemm_k8 call and the closing braces after it.
 *   - Several spans look truncated, e.g. "for (i=0;iN_BLOCKING)",
 *     "k_countK_BLOCKING" and "m_countM_BLOCKING" -- as if everything between
 *     a less-than and the next greater-than character had been stripped.
 *     The rest of scale_c_k8's body and the header of the function that
 *     follows it are not present in this text.
 *
 * What the surviving text shows:
 *   - A(i,j), B(i,j), C(i,j) address element (i,j) at offset i + j*LDA,
 *     i + j*LDB and i + j*LDC respectively.
 *   - M_BLOCKING (192), N_BLOCKING (2048) and K_BLOCKING (384) appear as the
 *     capped value in the ternaries that pick each tile extent.
 *   - scale_c_k8(C, M, N, LDC, scalar) declares loop indices i and j; its loop
 *     body is missing. Presumably it multiplies the M x N block of C by
 *     scalar -- confirm against the original.
 *   - The remaining fragment iterates k_count and m_count (an n_count tile
 *     offset is also referenced) and calls macro_kernel_gemm_k8 with
 *     m_inc, n_inc, k_inc, alpha, and the addresses of A(m_count,k_count),
 *     B(k_count,n_count) and C(m_count,n_count) plus LDA, LDB, LDC.
 *
 * NOTE(review): the last tile-size ternary falls back to "N-m_count", while
 * the other two fall back to "N-n_count" and "K-k_count". This looks like a
 * typo for "M-m_count" (it would give a wrong tile height for the final
 * partial tile of rows whenever M != N) -- confirm and fix when the file is
 * restored.
 */
#include "immintrin.h" #define A(i,j) A[(i)+(j)*LDA] #define B(i,j) B[(i)+(j)*LDB] #define C(i,j) C[(i)+(j)*LDC] #define M_BLOCKING 192 #define N_BLOCKING 2048 #define K_BLOCKING 384 void scale_c_k8(double *C,int M, int N, int LDC, double scalar){ int i,j; for (i=0;iN_BLOCKING)?N_BLOCKING:N-n_count; for (k_count=0;k_countK_BLOCKING)?K_BLOCKING:K-k_count; for (m_count=0;m_countM_BLOCKING)?M_BLOCKING:N-m_count; //macro kernel: to compute C += A_tilt * B_tilt macro_kernel_gemm_k8(m_inc,n_inc,k_inc,alpha,&A(m_count,k_count), LDA, &B(k_count,n_count), LDB, &C(m_count, n_count), LDC); } } } }