TP-openmp/BE_OpenMP_2022/norm1/main.c

#include "aux.h"
#include "omp.h"
double norm1(double **A, int m, int n);
double norm1_colmajor(double **A, int m, int n);
double norm1_rowmajor(double **A, int m, int n);

int main(int argc, char **argv)
{
  long t_start, t_end;
  int m, n, i, j;
  double nrm, nrmrm, nrmcm;
  double **A;

  /* Command line arguments: matrix size */
  if (argc == 3)
  {
    m = atoi(argv[1]); /* the number of matrix rows */
    n = atoi(argv[2]); /* the number of matrix cols */
  }
  else
  {
    printf("Usage:\n\n ./main m n\n\nwhere m and n are the number of rows and cols in the matrix\n");
    return 1;
  }

  A = (double **)malloc(m * sizeof(double *));
  for (i = 0; i < m; i++)
  {
    A[i] = (double *)malloc(n * sizeof(double));
    for (j = 0; j < n; j++)
    {
      A[i][j] = ((double)rand() / (double)RAND_MAX);
    }
  }

  /* warm up */
  nrm = norm1(A, m, n);

  t_start = usecs();
  nrm = norm1(A, m, n);
  t_end = usecs() - t_start;
  printf("Sequential -- norm:%8.4f time (usecs):%6ld\n", nrm, t_end);

  t_start = usecs();
  nrmcm = norm1_colmajor(A, m, n);
  t_end = usecs() - t_start;
  printf("Col-major -- norm:%8.4f time (usecs):%6ld\n", nrmcm, t_end);

  t_start = usecs();
  nrmrm = norm1_rowmajor(A, m, n);
  t_end = usecs() - t_start;
  printf("Row-major -- norm:%8.4f time (usecs):%6ld\n", nrmrm, t_end);
  printf("\n");

  for (i = 0; i < m; i++)
  {
    free(A[i]);
  }
  free(A);

  return 0;
}
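
/*
  Build/run example (assumed: GCC with OpenMP support and the timing helper
  usecs() provided by aux.c; adapt to the actual build setup of the TP):

    gcc -O2 -fopenmp main.c aux.c -o main -lm
    OMP_NUM_THREADS=4 ./main 1000 100000
*/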

/* Sequential 1-norm: maximum over the columns of the sum of absolute values */
double norm1(double **A, int m, int n)
{
  int i, j;
  double nrm, tmp;

  nrm = 0.0;
  for (j = 0; j < n; j++)
  {
    tmp = 0.0;
    for (i = 0; i < m; i++)
    {
      tmp += fabs(A[i][j]);
    }
    if (tmp > nrm)
      nrm = tmp;
  }
  return nrm;
}

/* Parallel version: the outer loop over columns is split among threads */
double norm1_colmajor(double **A, int m, int n)
{
  int i, j;
  double nrm = 0.0;
  double tmp;

  #pragma omp parallel private(i, j, tmp) shared(nrm)
  #pragma omp for
  for (j = 0; j < n; j++)
  {
    tmp = 0.0;
    for (i = 0; i < m; i++)
    {
      tmp += fabs(A[i][j]);
    }
    if (tmp > nrm)
    {
      /* re-check inside a critical section: another thread may have
         updated nrm since the unprotected test above, so a plain
         atomic write could overwrite a larger value */
      #pragma omp critical
      {
        if (tmp > nrm)
          nrm = tmp;
      }
    }
  }
  return nrm;
}
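
/*
  Alternative sketch (not part of the exercise code): with OpenMP 3.1 or
  later, the maximum column sum can be computed with a max reduction, which
  removes the need for any critical section or atomic access. The function
  name norm1_colmajor_reduction is only illustrative.
*/
double norm1_colmajor_reduction(double **A, int m, int n)
{
  int i, j;
  double nrm = 0.0;
  double tmp;

  #pragma omp parallel for private(i, tmp) reduction(max : nrm)
  for (j = 0; j < n; j++)
  {
    tmp = 0.0;
    for (i = 0; i < m; i++)
    {
      tmp += fabs(A[i][j]); /* sum of absolute values of column j */
    }
    if (tmp > nrm)
      nrm = tmp;
  }
  return nrm;
}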

/* Parallel version: the outer loop over rows is split among threads;
   the shared array of column sums is protected with atomic updates.
   Braces make the parallel region enclose both the worksharing loop
   and the single construct that computes the final maximum. */
double norm1_rowmajor(double **A, int m, int n)
{
  int i, j;
  double nrm, *tmp;

  nrm = 0.0;
  tmp = (double *)malloc(n * sizeof(double));
  for (j = 0; j < n; j++)
  {
    tmp[j] = 0.0;
  }

  #pragma omp parallel private(i, j) shared(tmp, nrm)
  {
    #pragma omp for
    for (i = 0; i < m; i++)
    {
      for (j = 0; j < n; j++)
      {
        #pragma omp atomic update
        tmp[j] += fabs(A[i][j]);
      }
    }

    /* one thread computes the maximum of the column sums */
    #pragma omp single
    {
      for (int k = 0; k < n; k++)
      {
        if (tmp[k] > nrm)
        {
          nrm = tmp[k];
        }
      }
    }
  }
  free(tmp);

  return nrm;
}
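
/*
  Alternative sketch (assumes OpenMP 4.5 or later for reductions over array
  sections): reduction(+ : tmp[:n]) gives every thread a private copy of the
  column-sum array, so the atomic update in the inner loop is not needed.
  The function name norm1_rowmajor_reduction is only illustrative.
*/
double norm1_rowmajor_reduction(double **A, int m, int n)
{
  int i, j, k;
  double nrm = 0.0;
  double *tmp = (double *)calloc(n, sizeof(double));

  #pragma omp parallel for private(j) reduction(+ : tmp[:n])
  for (i = 0; i < m; i++)
  {
    for (j = 0; j < n; j++)
    {
      tmp[j] += fabs(A[i][j]); /* accumulate into the thread-private copy */
    }
  }

  for (k = 0; k < n; k++)
  {
    if (tmp[k] > nrm)
      nrm = tmp[k];
  }
  free(tmp);
  return nrm;
}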

/*
Results: n=100000 m=1000

threads = 1:
Sequential -- norm:50277.9944 time (usecs):770322
Col-major -- norm:50277.9944 time (usecs):766831
Row-major -- norm:50277.9944 time (usecs):775600

threads = 2:
Sequential -- norm:50277.9944 time (usecs):745682
Col-major -- norm:50277.9944 time (usecs):394515
Row-major -- norm:50277.9944 time (usecs):954254

threads = 4:
Sequential -- norm:50277.9944 time (usecs):747250
Col-major -- norm:50277.9944 time (usecs):296826
Row-major -- norm:50277.9944 time (usecs):559891

threads = 30:
Sequential -- norm:50277.9944 time (usecs):748683
Col-major -- norm:50277.9944 time (usecs):253433
Row-major -- norm:50277.9944 time (usecs):436225

We observe that the col-major version is the fastest. This is most likely
due to the atomic update required inside the inner loop of the row-major
version. A better version would create one tmp array per thread and then
sum them at the end; an unfinished draft is available in the file
main.c.broken, and a rough sketch of the idea follows below.
*/
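
/*
  Rough illustration of the idea mentioned above (one tmp array per thread,
  merged at the end). This is only a sketch, not the contents of
  main.c.broken, and the function name norm1_rowmajor_perthread is
  illustrative only.
*/
double norm1_rowmajor_perthread(double **A, int m, int n)
{
  int i, j, k;
  double nrm = 0.0;
  double *tmp = (double *)calloc(n, sizeof(double));

  #pragma omp parallel private(i, j)
  {
    /* each thread accumulates column sums in its own private buffer */
    double *loc = (double *)calloc(n, sizeof(double));

    #pragma omp for
    for (i = 0; i < m; i++)
    {
      for (j = 0; j < n; j++)
      {
        loc[j] += fabs(A[i][j]);
      }
    }

    /* merge the per-thread buffers into the shared array, one thread at a time */
    #pragma omp critical
    {
      for (j = 0; j < n; j++)
      {
        tmp[j] += loc[j];
      }
    }
    free(loc);
  }

  for (k = 0; k < n; k++)
  {
    if (tmp[k] > nrm)
      nrm = tmp[k];
  }
  free(tmp);
  return nrm;
}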