C1 += A1*B1, gemm(): 6.9724 sec
C2 += A2*B2, pgemm(): 0.437025 sec
speedup = 15.9542
|C1-C2| = 0

