#include "immintrin.h" #define A(i,j) A[(i)+(j)*LDA] #define B(i,j) B[(i)+(j)*LDB] #define C(i,j) C[(i)+(j)*LDC] #define M_BLOCKING 192 #define N_BLOCKING 2048 #define K_BLOCKING 384 void scale_c_k11(double *C,int M, int N, int LDC, double scalar){ int i,j; for (i=0;i23;count_first+=24,count_sub-=24){ tosrc=src+count_first; for(count_second=0;count_second7;count_first+=8,count_sub-=8){ tosrc=src+count_first; for(count_second=0;count_second1;count_first+=2,count_sub-=2){ tosrc=src+count_first; for(count_second=0;count_second0;count_first+=1,count_sub-=1){ tosrc=src+count_first; for(count_second=0;count_second1;count_second+=2,count_sub-=2){ tosrc1=src+count_second*leading_dim;tosrc2=tosrc1+leading_dim; for (count_first=0;count_first0;count_second++,count_sub-=1){ tosrc1=src+count_second*leading_dim; for (count_first=0;count_first23;m_count_sub-=24,m_count+=24){ //call the micro kernel: m24n8; i=m_count;j=0;ptr_packing_a=a_buffer+m_count*K; ptr_packing_b0=b_buffer;ptr_packing_b1=ptr_packing_b0+2*K; ptr_packing_b2=ptr_packing_b1+2*K;ptr_packing_b3=ptr_packing_b2+2*K; macro_kernel_24xkx8_packing_avx512_v2 } for (;m_count_sub>7;m_count_sub-=8,m_count+=8){ //call the micro kernel: m8n8; i=m_count;j=0;ptr_packing_a=a_buffer+m_count*K; ptr_packing_b0=b_buffer;ptr_packing_b1=ptr_packing_b0+2*K; ptr_packing_b2=ptr_packing_b1+2*K;ptr_packing_b3=ptr_packing_b2+2*K; macro_kernel_8xkx8_packing_avx512_v2 } for (;m_count_sub>1;m_count_sub-=2,m_count+=2){ //call the micro kernel: m2n8; i=m_count;j=0;ptr_packing_a=a_buffer+m_count*K; ptr_packing_b0=b_buffer;ptr_packing_b1=ptr_packing_b0+2*K; ptr_packing_b2=ptr_packing_b1+2*K;ptr_packing_b3=ptr_packing_b2+2*K; macro_kernel_2xkx8_packing_avx512_v2 } for (;m_count_sub>0;m_count_sub-=1,m_count+=1){ //call the micro kernel: m1n8; i=m_count;j=0;ptr_packing_a=a_buffer+m_count*K; ptr_packing_b0=b_buffer;ptr_packing_b1=ptr_packing_b0+2*K; ptr_packing_b2=ptr_packing_b1+2*K;ptr_packing_b3=ptr_packing_b2+2*K; macro_packing_kernel_1xkx8_v2 } } void kernel_n_4_v2_k11(double *a_buffer,double *b_buffer,double *c_ptr,int m,int K,int LDC,double alpha){ int m_count,m_count_sub; int i,j,k; double *C=c_ptr; double sc0,sc1,sc2,sc3,sc4,sc5,sc6,sc7,sa,sb0,sb1,sb2,sb3,sb4,sb5,sb6,sb7; __m128d da,da0,da1,da2,db0,db1,db2,db3; __m128d dc00,dc10,dc20,dc30,dc40,dc50,dc60,dc70; __m512d valpha = _mm512_set1_pd(alpha);//broadcast alpha to a 512-bit vector __m128d dvalpha = _mm_set1_pd(alpha);//broadcast alpha to a 128-bit vector __m512d a,a0,a1,a2,b0,b1,b2,b3; __m512d c00,c01,c02,c10,c11,c12,c20,c21,c22,c30,c31,c32,c40,c41,c42,c50,c51,c52,c60,c61,c62,c70,c71,c72; __m512d c0,c1,c2,c3; double *ptr_packing_a,*ptr_packing_b0,*ptr_packing_b1,*ptr_packing_b2,*ptr_packing_b3; int k_start,k_end,K4; K4=K&-4;k_end=K;k_start=0; for (m_count_sub=m,m_count=0;m_count_sub>23;m_count_sub-=24,m_count+=24){ //call the micro kernel: m24n4; i=m_count;j=0;ptr_packing_a=a_buffer+m_count*K; ptr_packing_b0=b_buffer;ptr_packing_b1=ptr_packing_b0+2*K; macro_kernel_24xkx4_packing_avx512_v2 } for (;m_count_sub>7;m_count_sub-=8,m_count+=8){ //call the micro kernel: m8n4; i=m_count;j=0;ptr_packing_a=a_buffer+m_count*K; ptr_packing_b0=b_buffer;ptr_packing_b1=ptr_packing_b0+2*K; macro_kernel_8xkx4_packing_avx512_v2 } for (;m_count_sub>1;m_count_sub-=2,m_count+=2){ //call the micro kernel: m2n4; i=m_count;j=0;ptr_packing_a=a_buffer+m_count*K; ptr_packing_b0=b_buffer;ptr_packing_b1=ptr_packing_b0+2*K; macro_kernel_2xkx4_packing_avx512_v2 } for (;m_count_sub>0;m_count_sub-=1,m_count+=1){ //call the micro kernel: m1n4; } } void macro_kernel_k11(double *a_buffer,double *b_buffer,int m,int n,int k,double *C, int LDC,double alpha){ int m_count,n_count,m_count_sub,n_count_sub; // printf("m= %d, n=%d, k = %d\n",m,n,k); for (n_count_sub=n,n_count=0;n_count_sub>7;n_count_sub-=8,n_count+=8){ //call the m layer with n=8; kernel_n_8_v2_k11(a_buffer,b_buffer+n_count*k,C+n_count*LDC,m,k,LDC,alpha); } for (;n_count_sub>3;n_count_sub-=4,n_count+=4){ //call the m layer with n=4 kernel_n_4_v2_k11(a_buffer,b_buffer+n_count*k,C+n_count*LDC,m,k,LDC,alpha); } for (;n_count_sub>1;n_count_sub-=2,n_count+=2){ //call the m layer with n=2 } for (;n_count_sub>0;n_count_sub-=1,n_count+=1){ //call the m layer with n=1 } } void mydgemm_cpu_v11(int M, int N, int K, double alpha, double *A, int LDA, double *B, int LDB, double beta, double *C, int LDC){ if (beta != 1.0) scale_c_k11(C,M,N,LDC,beta); double *b_buffer = (double *)aligned_alloc(4096,K_BLOCKING*N_BLOCKING*sizeof(double)); double *a_buffer = (double *)aligned_alloc(4096,K_BLOCKING*M_BLOCKING*sizeof(double)); int m_count, n_count, k_count; int m_inc, n_inc, k_inc; for (n_count=0;n_countN_BLOCKING)?N_BLOCKING:N-n_count; for (k_count=0;k_countK_BLOCKING)?K_BLOCKING:K-k_count; packing_b_k11(B+k_count+n_count*LDB,b_buffer,LDB,k_inc,n_inc); for (m_count=0;m_countM_BLOCKING)?M_BLOCKING:N-m_count; packing_a_k11(A+m_count+k_count*LDA,a_buffer,LDA,m_inc,k_inc); //macro kernel: to compute C += A_tilt * B_tilt macro_kernel_k11(a_buffer, b_buffer, m_inc, n_inc, k_inc, &C(m_count, n_count), LDC, alpha); } } } free(a_buffer);free(b_buffer); }