/*
 * Blocked double-precision matrix-multiply pieces with the "_k9" suffix.
 *
 * NOTE(review): this text looks damaged. Line breaks are gone (everything,
 * including the preprocessor directives and the trailing line comment, sits
 * on one physical line), and several loop conditions stop abruptly, e.g.
 * "for (i=0;i7;count_first+=8". Whole spans appear to be missing, so the
 * file cannot be built as shown -- restore it from an intact copy before
 * making code changes.
 *
 * What is still visible:
 *  - A(i,j), B(i,j), C(i,j) expand to X[(i)+(j)*LDX], i.e. element (i,j) is
 *    addressed as i plus j times the leading dimension.
 *  - Blocking sizes: M_BLOCKING 192, N_BLOCKING 2048, K_BLOCKING 384.
 *  - scale_c_k9(C, M, N, LDC, scalar): only the signature and the start of
 *    its first loop remain; presumably it scales C by scalar -- confirm.
 *  - Fragments of packing loops that step count_first by 8 and then by 4
 *    while reading from src + count_first.
 *  - The tail of a driver loop nest over n_count, k_count and m_count that
 *    packs a B panel with packing_b_k9, packs an A panel with packing_a_k9,
 *    calls macro_kernel_gemm_k9 on &C(m_count, n_count), and finally frees
 *    a_buffer and b_buffer.
 *
 * NOTE(review): the remainder branch paired with M_BLOCKING reads
 * "N-m_count", while the N and K clamps use "N-n_count" and "K-k_count".
 * By analogy this looks like it should be "M-m_count"; verify against the
 * intact source, since a wrong m_inc would change how many rows are packed
 * and passed to the macro kernel.
 */
#include "immintrin.h" #define A(i,j) A[(i)+(j)*LDA] #define B(i,j) B[(i)+(j)*LDB] #define C(i,j) C[(i)+(j)*LDC] #define M_BLOCKING 192 #define N_BLOCKING 2048 #define K_BLOCKING 384 void scale_c_k9(double *C,int M, int N, int LDC, double scalar){ int i,j; for (i=0;i7;count_first+=8,count_sub-=8){ tosrc=src+count_first; for(count_second=0;count_second3;count_first+=4,count_sub-=4){ tosrc=src+count_first; for(count_second=0;count_secondN_BLOCKING)?N_BLOCKING:N-n_count; for (k_count=0;k_countK_BLOCKING)?K_BLOCKING:K-k_count; packing_b_k9(B+k_count+n_count*LDB,b_buffer,LDB,k_inc,n_inc); for (m_count=0;m_countM_BLOCKING)?M_BLOCKING:N-m_count; packing_a_k9(A+m_count+k_count*LDA,a_buffer,LDA,m_inc,k_inc); //macro kernel: to compute C += A_tilt * B_tilt macro_kernel_gemm_k9(m_inc,n_inc,k_inc,alpha,a_buffer, LDA, b_buffer, LDB, &C(m_count, n_count), LDC); } } } free(a_buffer);free(b_buffer); }