194 lines
3.8 KiB
Plaintext
194 lines
3.8 KiB
Plaintext
|
#include "aux.h"
|
||
|
#include "omp.h"
|
||
|
#define MAX_THREADS 30
|
||
|
|
||
|
double norm1(double **A, int m, int n);
|
||
|
double norm1_colmajor(double **A, int m, int n);
|
||
|
double norm1_rowmajor(double **A, int m, int n);
|
||
|
|
||
|
int main(int argc, char **argv)
|
||
|
{
|
||
|
long t_start, t_end;
|
||
|
int m, n, i, j;
|
||
|
double nrm, nrmrm, nrmcm;
|
||
|
double **A;
|
||
|
|
||
|
// Command line argument: matrix size
|
||
|
if (argc == 3)
|
||
|
{
|
||
|
m = atoi(argv[1]); /* the number of matrix rows */
|
||
|
n = atoi(argv[2]); /* the number of matrix cols */
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
printf("Usage:\n\n ./main n m\n\nwhere m and n are the number of rows and cols in the matrix\n");
|
||
|
return 1;
|
||
|
}
|
||
|
|
||
|
A = (double **)malloc(m * sizeof(double *));
|
||
|
for (i = 0; i < m; i++)
|
||
|
{
|
||
|
A[i] = (double *)malloc(n * sizeof(double));
|
||
|
|
||
|
for (j = 0; j < n; j++)
|
||
|
{
|
||
|
A[i][j] = ((double)rand() / (double)RAND_MAX);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/* warm up */
|
||
|
nrm = norm1(A, m, n);
|
||
|
|
||
|
t_start = usecs();
|
||
|
nrm = norm1(A, m, n);
|
||
|
t_end = usecs() - t_start;
|
||
|
printf("Sequential -- norm:%8.4f time (usecs):%6ld\n", nrm, t_end);
|
||
|
|
||
|
t_start = usecs();
|
||
|
nrmcm = norm1_colmajor(A, m, n);
|
||
|
t_end = usecs() - t_start;
|
||
|
printf("Col-major -- norm:%8.4f time (usecs):%6ld\n", nrmcm, t_end);
|
||
|
|
||
|
t_start = usecs();
|
||
|
nrmrm = norm1_rowmajor(A, m, n);
|
||
|
t_end = usecs() - t_start;
|
||
|
printf("Row-major -- norm:%8.4f time (usecs):%6ld\n", nrmrm, t_end);
|
||
|
|
||
|
printf("\n");
|
||
|
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
double norm1(double **A, int m, int n)
|
||
|
{
|
||
|
int i, j;
|
||
|
double nrm, tmp;
|
||
|
nrm = 0.0;
|
||
|
|
||
|
for (j = 0; j < n; j++)
|
||
|
{
|
||
|
tmp = 0.0;
|
||
|
for (i = 0; i < m; i++)
|
||
|
{
|
||
|
tmp += fabs(A[i][j]);
|
||
|
}
|
||
|
if (tmp > nrm)
|
||
|
nrm = tmp;
|
||
|
}
|
||
|
|
||
|
return nrm;
|
||
|
}
|
||
|
|
||
|
double norm1_colmajor(double **A, int m, int n)
|
||
|
{
|
||
|
int i, j;
|
||
|
double nrm = 0.0;
|
||
|
double tmp;
|
||
|
|
||
|
#pragma omp parallel private(i, j, tmp) shared(nrm)
|
||
|
|
||
|
#pragma omp for
|
||
|
for (j = 0; j < n; j++)
|
||
|
{
|
||
|
tmp = 0.0;
|
||
|
for (i = 0; i < m; i++)
|
||
|
{
|
||
|
tmp += fabs(A[i][j]);
|
||
|
}
|
||
|
|
||
|
if (tmp > nrm)
|
||
|
{
|
||
|
#pragma omp atomic write
|
||
|
nrm = tmp;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
return nrm;
|
||
|
}
|
||
|
|
||
|
double norm1_rowmajor(double **A, int m, int n)
|
||
|
{
|
||
|
int i, j;
|
||
|
double nrm, **tmp, *tmp_sum;
|
||
|
|
||
|
nrm = 0.0;
|
||
|
|
||
|
tmp = (double **)malloc(m * sizeof(double *));
|
||
|
for (i = 0; i < m; i++)
|
||
|
{
|
||
|
tmp[i] = (double *)malloc(MAX_THREADS * sizeof(double));
|
||
|
|
||
|
for (j = 0; j < MAX_THREADS; j++)
|
||
|
{
|
||
|
tmp[i][j] = 0.0;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
#pragma omp parallel private(i, j) shared(tmp)
|
||
|
|
||
|
#pragma omp for
|
||
|
for (i = 0; i < m; i++)
|
||
|
{
|
||
|
for (j = 0; j < n; j++)
|
||
|
{
|
||
|
#pragma omp atomic update
|
||
|
tmp[j][omp_get_thread_num()] += fabs(A[i][j]);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
#pragma omp single
|
||
|
{
|
||
|
|
||
|
for (i = 0; i < m; i++)
|
||
|
{
|
||
|
for (j = 0; j < MAX_THREADS; j++)
|
||
|
{
|
||
|
tmp_sum[i] = tmp[i][j];
|
||
|
}
|
||
|
}
|
||
|
|
||
|
for (int k = 0; k < n; k++)
|
||
|
{
|
||
|
if (tmp_sum[k] > nrm)
|
||
|
{
|
||
|
nrm = tmp_sum[k];
|
||
|
}
|
||
|
}
|
||
|
|
||
|
free(tmp_sum);
|
||
|
}
|
||
|
|
||
|
return nrm;
|
||
|
}
|
||
|
|
||
|
/*
|
||
|
|
||
|
Résultats: n=100000 m=1000
|
||
|
|
||
|
threads = 1:
|
||
|
Sequential -- norm:50277.9944 time (usecs):770322
|
||
|
Col-major -- norm:50277.9944 time (usecs):766831
|
||
|
Row-major -- norm:50277.9944 time (usecs):775600
|
||
|
|
||
|
threads = 2:
|
||
|
Sequential -- norm:50277.9944 time (usecs):745682
|
||
|
Col-major -- norm:50277.9944 time (usecs):394515
|
||
|
Row-major -- norm:50277.9944 time (usecs):954254
|
||
|
|
||
|
threads = 4:
|
||
|
Sequential -- norm:50277.9944 time (usecs):747250
|
||
|
Col-major -- norm:50277.9944 time (usecs):296826
|
||
|
Row-major -- norm:50277.9944 time (usecs):559891
|
||
|
|
||
|
threads = 30:
|
||
|
Sequential -- norm:50277.9944 time (usecs):748683
|
||
|
Col-major -- norm:50277.9944 time (usecs):253433
|
||
|
Row-major -- norm:50277.9944 time (usecs):436225
|
||
|
|
||
|
|
||
|
On observe que col-major est la plus rapide.
|
||
|
Cela est surement dû au atomic update nécéssaire dans la boucle for.
|
||
|
Une version meilleur serait de créer une liste tmp par thread, et de ensuite les sommer à la fin,
|
||
|
une ébauche non finis est disponible dans le fichier main.c.broken
|
||
|
|
||
|
*/
|