/*
 * REVIEW NOTE -- this file looks damaged and should be restored from its
 * original before any code change is attempted.
 *
 * Visible symptoms:
 *  - Everything from the leading "//" up to "//macro kernel: to" sits on one
 *    physical line, so a C compiler reads that whole line as a single line
 *    comment. The next line then starts with the tail of a former comment
 *    ("compute C += A_tilt * B_tilt") as bare tokens.
 *  - Several loop headers are cut short, e.g. "for (i=0;i1;count_second+=2,"
 *    and "for (n_count=0;n_countN_BLOCKING)?". This pattern suggests that the
 *    text between a '<' and the following '>' was stripped -- TODO confirm.
 *  - One "#include" has no header name. aligned_alloc() and free() are used
 *    further down, so <stdlib.h> is presumably what was lost -- TODO confirm.
 *
 * What the surviving text shows:
 *  - A(i,j), B(i,j), C(i,j) address element (i,j) at offset i + j*LD, using
 *    LDA, LDB and LDC respectively.
 *  - Blocking sizes: M_BLOCKING 192, N_BLOCKING 2048, K_BLOCKING 384.
 *  - mydgemm_cpu_v15(M,N,K,alpha,A,LDA,B,LDB,beta,C,LDC):
 *      1. calls scale_c_k15(C,M,N,LDC,beta) when beta != 1.0;
 *      2. allocates b_buffer (K_BLOCKING*N_BLOCKING doubles) and a_buffer
 *         (K_BLOCKING*M_BLOCKING doubles) with aligned_alloc(4096, ...);
 *      3. loops over n_count, k_count, m_count; for each (n,k) block it calls
 *         packing_b_24x8_edge_version2_k15, and for each m block it calls
 *         packing_a_24x8_edge_k15 (alpha is passed to this A packing call)
 *         followed by macro_kernel_k15 on &C(m_count, n_count);
 *      4. frees both buffers.
 *
 * Defects to address once the source is restored:
 *  1. m_inc is computed as "...?M_BLOCKING:N-m_count". The n and k tails use
 *     N-n_count and K-k_count, so this one presumably should be M-m_count.
 *     As written, the last M block gets the wrong height whenever M != N.
 *  2. Neither aligned_alloc() result is checked for NULL before the buffers
 *     are handed to the packing routines.
 *  3. In the macro kernel, the loops commented "call the m layer with n=2"
 *     and "n=1" have no statements in this view. If that is not a side
 *     effect of the damage, columns left over after the n=8 and n=4 passes
 *     are never computed -- TODO confirm against the original.
 */
//#define TIMER 1 #include "immintrin.h" #include #ifdef TIMER #include "utils.h" #endif #define A(i,j) A[(i)+(j)*LDA] #define B(i,j) B[(i)+(j)*LDB] #define C(i,j) C[(i)+(j)*LDC] #define M_BLOCKING 192 #define N_BLOCKING 2048 #define K_BLOCKING 384 void scale_c_k15(double *C,int M, int N, int LDC, double scalar){ int i,j; for (i=0;i1;count_second+=2,count_sub-=2){ tosrc1=src+count_second*leading_dim;tosrc2=tosrc1+leading_dim; for (count_first=0;count_first0;count_second++,count_sub-=1){ tosrc1=src+count_second*leading_dim; for (count_first=0;count_first23;count_first+=24,count_sub-=24){ tosrc=src+count_first; for(count_second=0;count_second7;count_first+=8,count_sub-=8){ tosrc=src+count_first; for(count_second=0;count_second1;count_first+=2,count_sub-=2){ tosrc=src+count_first; for(count_second=0;count_second0;count_first+=1,count_sub-=1){ tosrc=src+count_first; for(count_second=0;count_second7;n_count_sub-=8,n_count+=8){ //call the m layer with n=8; macro_n8 } for (;n_count_sub>3;n_count_sub-=4,n_count+=4){ //call the m layer with n=4 macro_n4 } for (;n_count_sub>1;n_count_sub-=2,n_count+=2){ //call the m layer with n=2 } for (;n_count_sub>0;n_count_sub-=1,n_count+=1){ //call the m layer with n=1 } } void mydgemm_cpu_v15(\ int M, \ int N, \ int K, \ double alpha, \ double *A, \ int LDA, \ double *B, \ int LDB, \ double beta, \ double *C, \ int LDC)\ { if (beta != 1.0) scale_c_k15(C,M,N,LDC,beta); double *b_buffer = (double *)aligned_alloc(4096,K_BLOCKING*N_BLOCKING*sizeof(double)); double *a_buffer = (double *)aligned_alloc(4096,K_BLOCKING*M_BLOCKING*sizeof(double)); int m_count, n_count, k_count; int m_inc, n_inc, k_inc; for (n_count=0;n_countN_BLOCKING)?N_BLOCKING:N-n_count; for (k_count=0;k_countK_BLOCKING)?K_BLOCKING:K-k_count; packing_b_24x8_edge_version2_k15(B+k_count+n_count*LDB,b_buffer,LDB,k_inc,n_inc); for (m_count=0;m_countM_BLOCKING)?M_BLOCKING:N-m_count; packing_a_24x8_edge_k15(alpha, A+m_count+k_count*LDA,a_buffer,LDA,m_inc,k_inc); //macro kernel: to 
compute C += A_tilt * B_tilt macro_kernel_k15(a_buffer, b_buffer, m_inc, n_inc, k_inc, &C(m_count, n_count), LDC); } } } free(a_buffer);free(b_buffer); }