This loop has less efficient memory access patterns than a nearby outer loop. To fix: Reorder the loops if possible.
Interchanging is not always possible: when there are data dependencies between loop iterations, reordering the loops can change the results.…
/* Innermost loop steps k, so b[k][j] is read from a different row of b on
 * every iteration, while c[i][j] stays fixed for the whole inner loop. */
for (int j = 0; j < N; j++)
for (int k = 0; k < N; k++)
c[i][j] = c[i][j] + a[i][k] * b[k][j];
…
/*
 * Accumulate a matrix product into c: for every (i, j) in [0, N) x [0, N),
 * c[i][j] is increased by the sum over k of a[i][k] * b[k][j].
 *
 * a, b, c: arrays of at least N row pointers, each row indexed from 0 to N-1.
 * c is not cleared first; whatever it already holds is part of the result.
 * N:       number of rows and columns processed; N <= 0 leaves c untouched.
 *
 * Loop order is i, j, k: the innermost loop steps k, so b[k][j] is read
 * from a different row of b on each iteration.
 */
void matmul(float *a[], float *b[], float *c[], int N) {
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            for (int k = 0; k < N; k++) {
                c[i][j] += a[i][k] * b[k][j];
            }
        }
    }
}
…
/* Innermost loop steps j, so c[i][j] and b[k][j] each advance along a single
 * row, while a[i][k] stays fixed for the whole inner loop. */
for (int k = 0; k < N; k++)
for (int j = 0; j < N; j++)
c[i][j] = c[i][j] + a[i][k] * b[k][j];
…
/*
 * Accumulate a matrix product into c: for every (i, j) in [0, N) x [0, N),
 * c[i][j] is increased by the sum over k of a[i][k] * b[k][j].
 *
 * a, b, c: arrays of at least N row pointers, each row indexed from 0 to N-1.
 * c is not cleared first; whatever it already holds is part of the result.
 * N:       number of rows and columns processed; N <= 0 leaves c untouched.
 *
 * Loop order is i, k, j: the innermost loop steps j, so c[i][j] and b[k][j]
 * each advance along one row while a[i][k] stays fixed.
 */
void matmul(float *a[], float *b[], float *c[], int N) {
    for (int i = 0; i < N; i++) {
        for (int k = 0; k < N; k++) {
            for (int j = 0; j < N; j++) {
                c[i][j] += a[i][k] * b[k][j];
            }
        }
    }
}