This commit is contained in:
Laureηt 2023-06-22 20:19:48 +02:00
commit 21c8be4267
Signed by: Laurent
SSH key fingerprint: SHA256:kZEpW8cMJ54PDeCvOhzreNr4FSh6R13CMGH/POoO8DI
149 changed files with 12104 additions and 0 deletions

7
BE_OpenMP_2014/.vscode/settings.json vendored Normal file
View file

@ -0,0 +1,7 @@
{
"files.associations": {
"*.html": "html",
"*.toml": "toml",
"*.tcc": "c"
}
}

13
BE_OpenMP_2014/Makefile Normal file
View file

@ -0,0 +1,13 @@
# Top-level helper: `make clean` enters each exercise directory in turn and
# runs that directory's own `clean` target in a sub-shell.
clean:
(cd derivative_free; make clean)
(cd linked_list; make clean)
(cd matrix_multiplication; make clean)
(cd reduction; make clean)
(cd synchronizations; make clean)

View file

@ -0,0 +1,31 @@
# Builds the exercise binary `main` from aux.c and main.c with gcc.
CC = gcc
LINK = $(CC)
# Optimised build with OpenMP enabled; the commented line below is the debug alternative.
CFLAGS = -O3 -fopenmp -Wunknown-pragmas
# CFLAGS = -g -fopenmp
# The compile flags are reused at link time.
LDFLAGS = $(CFLAGS)
LIBS = -lm
OBJS = aux.o main.o
# Link the objects into the executable.
main: $(OBJS)
$(LINK) -o $@ $(LDFLAGS) $(OBJS) $(LIBS)
# Remove the objects and the executable.
clean:
(rm -f *.o main)
# Pattern rules. CINCLUDES, INCLUDES, CDEFS, PREC, FC and the F* variables are
# not assigned in this file, so they expand to nothing unless set by the caller.
%.o: %.c
$(CC) $(CFLAGS) $(CINCLUDES) $(INCLUDES) $(CDEFS) $(PREC) -c $<
%.o: %.f90
$(FC) $(FCFLAGS) $(FINCLUDES) $(INCLUDES) $(FDEFS) $(DEFINE_PREPEND)$(PREC) -c $<

View file

@ -0,0 +1,31 @@
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
/* Return the current gettimeofday() time expressed in microseconds. */
long usecs (){
    const long usec_per_sec = 1000000;
    struct timeval now;

    gettimeofday(&now, NULL);
    return now.tv_sec * usec_per_sec + now.tv_usec;
}
/* Busy-wait: spin on usecs() until at least `sec` seconds have elapsed. */
void mysleep(double sec){
    long start;
    long now = 0;

    start = usecs();
    /* `now` starts at 0, so the very first test uses -start as the elapsed
       time; the clock is only sampled again inside the loop. */
    for (;;) {
        double elapsed = ((double) now - start)/1000000;
        if (!(elapsed < sec))
            break;
        now = usecs();
    }
}
/* Objective function: a fixed quadratic in x and y.
   The terms are accumulated left to right, one per statement. */
double evaluate(double x, double y){
    double acc;

    /* An artificial delay (mysleep) can be inserted here to make each
       evaluation more expensive; it is currently disabled. */
    acc = 1.34*x*x;
    acc = acc + 1.15*y*y;
    acc = acc + 2;
    acc = acc + x*y;
    acc = acc - y;
    acc = acc - 2*x;
    acc = acc + 0.33;
    return acc;
}

View file

@ -0,0 +1,3 @@
long usecs ();
double evaluate(double x, double y);
void mysleep(double sec);

View file

@ -0,0 +1,168 @@
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "aux.h"
#include "omp.h"
#define MAX_THREADS 16
#define MAXIT 1000000
double sequential_minimization(double s, int p, double x0, double y0);
double parallel_minimization(double s, int p, double x0, double y0);
/* Driver: parses `p` (points per step) and `s` (step length) from the command
   line, then times the sequential and the parallel minimization from (10,10). */
int main(int argc, char **argv)
{
    long t_start, t_end;
    double s, x0, y0;
    int p;

    if (argc != 3)
    {
        printf("Usage:\n\n ./main p s\n\nwhere p is the number of points around the current minimum where the function has to be evaluated\nand s is the step size.\n");
        return 1;
    }

    p = atoi(argv[1]); /* the number of points to be evaluated */
    s = atof(argv[2]); /* the step length */

    /* Starting point; no need to change these settings except for debugging */
    x0 = 10;
    y0 = 10;

    t_start = usecs();
    sequential_minimization(s, p, x0, y0);
    t_end = usecs();
    printf("Sequential time : %8.2f msec.\n", ((double)t_end - t_start) / 1000.0);

    printf("\n\n");

    t_start = usecs();
    parallel_minimization(s, p, x0, y0);
    t_end = usecs();
    printf("Parallel time : %8.2f msec.\n", ((double)t_end - t_start) / 1000.0);

    return 0;
}
/*
 * Parallel derivative-free minimization of evaluate().
 *
 * Starting from (x0, y0), each step evaluates the function on `p` points
 * evenly spread on a circle of radius `s` around the current minimum and
 * moves to the best of them. The search stops when a step brings no
 * improvement or after MAXIT steps.
 *
 * The p evaluations of one step are shared among the threads. Each thread
 * keeps its own best candidate in private variables, then merges it into the
 * shared minimum inside a critical section. Ties are broken on the point
 * index so the result does not depend on the thread schedule and matches
 * sequential_minimization().
 *
 * Returns the minimum value found and prints it with its coordinates.
 */
double parallel_minimization(double s, int p, double x0, double y0)
{
    int cnt = 0;     /* completed improving steps (shared) */
    int running = 1; /* cleared when a step brings no improvement (shared) */
    int min_i = p;   /* index of the point holding the shared minimum; p = "centre" */
    double min_nx = x0;
    double min_ny = y0;
    double min_nz = evaluate(min_nx, min_ny);

#pragma omp parallel shared(cnt, running, min_i, min_nx, min_ny, min_nz)
    {
        /* cnt and running are only written inside the `single` below, whose
           implicit barrier makes every thread see the same loop condition. */
        while (cnt < MAXIT && running)
        {
            const double x = min_nx, y = min_ny, z = min_nz;
            double best_x = x, best_y = y, best_z = z;
            int best_i = p;
            int i;

            /* Every thread must have copied the centre before it is updated. */
#pragma omp barrier

#pragma omp for schedule(static) nowait
            for (i = 0; i < p; i++)
            {
                const double theta = 2.0 * M_PI * i / ((double)p);
                const double nx = x + s * cos(theta);
                const double ny = y + s * sin(theta);
                const double nz = evaluate(nx, ny);

                /* Strict comparison keeps the first best point of this chunk */
                if (nz < best_z)
                {
                    best_x = nx;
                    best_y = ny;
                    best_z = nz;
                    best_i = i;
                }
            }

            /* Merge the thread-local candidate into the shared minimum */
#pragma omp critical
            {
                if (best_z < min_nz || (best_z == min_nz && best_i < min_i))
                {
                    min_nx = best_x;
                    min_ny = best_y;
                    min_nz = best_z;
                    min_i = best_i;
                }
            }

            /* All candidates must be merged before the step is judged */
#pragma omp barrier

#pragma omp single
            {
                /* If no improvement over the old minimum, terminate */
                if (min_nz >= z)
                    running = 0;
                else
                    cnt++;
                min_i = p;
            }
        }
    }

    printf("Minimum found is %.10f at x=%.4f, y=%.4f in %d steps\n", min_nz, min_nx, min_ny, cnt);
    return min_nz;
}
/*
 * Sequential derivative-free minimization of evaluate().
 *
 * From (x0, y0), repeatedly evaluates the function on `p` points evenly
 * spread on a circle of radius `s` around the current minimum and moves to
 * the best one. Stops when a step brings no improvement or after MAXIT
 * steps. Prints and returns the minimum value found.
 */
double sequential_minimization(double s, int p, double x0, double y0)
{
    double best_x = x0;
    double best_y = y0;
    double best_z = evaluate(best_x, best_y);
    int step = 0;
    int k;

    while (step < MAXIT)
    {
        /* Centre of this step: the minimum found so far */
        const double cx = best_x;
        const double cy = best_y;
        const double cz = best_z;

        for (k = 0; k < p; k++)
        {
            const double theta = 2.0 * M_PI * k / ((double)p);
            const double nx = cx + s * cos(theta);
            const double ny = cy + s * sin(theta);
            const double nz = evaluate(nx, ny);

            /* A strictly lower value becomes the new minimum */
            if (nz < best_z)
            {
                best_z = nz;
                best_x = nx;
                best_y = ny;
            }
        }

        /* If no improvement over the old minimum, terminate */
        if (best_z >= cz)
            break;
        step++;
    }

    printf("Minimum found is %.10f at x=%.4f, y=%.4f in %d steps\n", best_z, best_x, best_y, step);
    return best_z;
}

Binary file not shown.

View file

@ -0,0 +1,9 @@
#!/bin/bash
# Environment setup: exports LBLAS (read by Makefiles that use $(LBLAS)) and
# prepends site-specific directories to PATH and LD_LIBRARY_PATH.
# NOTE(review): the exports only reach the caller's shell if this file is sourced — confirm usage.
# Static ACML library plus the extra libraries listed with it.
export LBLAS="/mnt/n7fs/ens/tp_abuttari/acml-4.4.0/gfortran64/lib/libacml.a -lgfortran -lm"
# Put the gcc_trunk bin directory first on PATH.
export PATH=/mnt/n7fs/ens/tp_abuttari/opt/gcc_trunk/bin/:$PATH;
# Library directories (gmp, mpfr, mpc, isl, gcc_trunk), each prepended in turn.
export LD_LIBRARY_PATH=/mnt/n7fs/ens/tp_abuttari/opt/gmp-4.3.2/lib/:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/mnt/n7fs/ens/tp_abuttari/opt/mpfr-2.4.2/lib/:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/mnt/n7fs/ens/tp_abuttari/opt/mpc-0.8.1/lib/:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/mnt/n7fs/ens/tp_abuttari/opt/isl-0.12.2/lib/:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/mnt/n7fs/ens/tp_abuttari/opt/gcc_trunk/lib64/:$LD_LIBRARY_PATH

View file

@ -0,0 +1,31 @@
# Builds the exercise binary `main` from aux.c and main.c with gcc.
CC = gcc
LINK = $(CC)
# Optimised build with OpenMP enabled; the commented line below is the debug alternative.
CFLAGS = -O3 -fopenmp -Wunknown-pragmas
# CFLAGS = -g -fopenmp
# The compile flags are reused at link time.
LDFLAGS = $(CFLAGS)
LIBS = -lm
OBJS = aux.o main.o
# Link the objects into the executable.
main: $(OBJS)
$(LINK) -o $@ $(LDFLAGS) $(OBJS) $(LIBS)
# Remove the objects and the executable.
clean:
(rm -f *.o main)
# Pattern rules. CINCLUDES, INCLUDES, CDEFS, PREC, FC and the F* variables are
# not assigned in this file, so they expand to nothing unless set by the caller.
%.o: %.c
$(CC) $(CFLAGS) $(CINCLUDES) $(INCLUDES) $(CDEFS) $(PREC) -c $<
%.o: %.f90
$(FC) $(FCFLAGS) $(FINCLUDES) $(INCLUDES) $(FDEFS) $(DEFINE_PREPEND)$(PREC) -c $<

View file

@ -0,0 +1,57 @@
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <string.h>
#include <math.h>
#include "aux.h"
/* Simulate work on one list node: spin for a short fixed delay, then return
   the node's value plus one. */
unsigned long process_node(struct node *node){
    unsigned long result;

    mysleep(5e-5);
    result = node->val;
    result = result + 1;
    return result;
}
/* Return the current gettimeofday() time expressed in microseconds. */
long usecs (){
    const long usec_per_sec = 1000000;
    struct timeval now;

    gettimeofday(&now, NULL);
    return now.tv_sec * usec_per_sec + now.tv_usec;
}
/* Busy-wait: spin on usecs() until at least `sec` seconds have elapsed. */
void mysleep(double sec){
    long start;
    long now = 0;

    start = usecs();
    /* `now` starts at 0, so the very first test uses -start as the elapsed
       time; the clock is only sampled again inside the loop. */
    for (;;) {
        double elapsed = ((double) now - start)/1000000;
        if (!(elapsed < sec))
            break;
        now = usecs();
    }
}
/*
 * Build a singly linked list of 10000 heap-allocated nodes and store its
 * first node in *head. Nodes 1..9999 hold the values 1..9999 in order and the
 * last node holds 1; the last node's `next` is NULL.
 *
 * The caller owns the nodes. If an allocation fails the program reports the
 * error and exits, instead of dereferencing a NULL pointer.
 */
void init_list(struct node **head){
    struct node *curr;
    int n, i;

    n = 10000;

    curr = malloc(sizeof *curr);
    if (curr == NULL) {
        perror("init_list: malloc");
        exit(EXIT_FAILURE);
    }
    *head = curr;

    for (i = 0; i < n-1; i++) {
        curr->val = i+1;
        curr->next = malloc(sizeof *curr->next);
        if (curr->next == NULL) {
            perror("init_list: malloc");
            exit(EXIT_FAILURE);
        }
        curr = curr->next;
    }

    /* Tail node */
    curr->val = 1;
    curr->next = NULL;
}

View file

@ -0,0 +1,11 @@
/* Node of a singly linked list. */
struct node{
unsigned long val;  /* payload of the node */
struct node *next;  /* following node, or NULL at the end of the list */
};
long usecs ();
unsigned long process_node(struct node *node);
void mysleep(double sec);
void init_list(struct node **head);

View file

@ -0,0 +1,121 @@
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <string.h>
#include <math.h>
#include "aux.h"
#include "omp.h"
#define MAX_SIZE 10000
unsigned long sequential_sweep(struct node *head);
unsigned long parallel_for_sweep(struct node *head);
unsigned long parallel_task_sweep(struct node *head);
/* Driver: builds the list once, then times the sequential sweep, the
   parallel-for sweep and the task-based sweep, printing each result. */
int main(int argc, char **argv)
{
    long t_start, t_end;
    unsigned long result;
    struct node *head;

    init_list(&head);

    t_start = usecs();
    result = sequential_sweep(head);
    t_end = usecs();
    printf("Sequential time : %8.2f msec.", ((double)t_end - t_start) / 1000.0);
    /* result is unsigned long, so the conversion must be %lu */
    printf(" -- result: %5lu\n", result);

    t_start = usecs();
    result = parallel_for_sweep(head);
    t_end = usecs();
    printf("Parallel for time : %8.2f msec.", ((double)t_end - t_start) / 1000.0);
    printf(" -- result: %5lu\n", result);

    t_start = usecs();
    result = parallel_task_sweep(head);
    t_end = usecs();
    printf("Parallel task time : %8.2f msec.", ((double)t_end - t_start) / 1000.0);
    printf(" -- result: %5lu\n", result);

    return 0;
}
/* Walk the list from `head` to its end and return the sum of
   process_node() over every node. */
unsigned long sequential_sweep(struct node *head)
{
    unsigned long total = 0;
    struct node *it;

    for (it = head; it != NULL; it = it->next)
        total += process_node(it);

    return total;
}
/*
 * Sum process_node() over the whole list using an OpenMP parallel for.
 *
 * A linked list cannot be indexed, so the list is walked once to count its
 * nodes and once more to copy the node pointers into an array of exactly
 * that size; the array is then processed by a parallel loop with a `+`
 * reduction. If the array cannot be allocated the list is processed
 * sequentially instead.
 */
unsigned long parallel_for_sweep(struct node *head)
{
    unsigned long acc = 0;
    long size = 0;
    long i;
    struct node *curr;
    struct node **pointers;

    /* First pass: find the length of the list */
    for (curr = head; curr; curr = curr->next)
        size++;
    if (size == 0)
        return 0;

    pointers = malloc(size * sizeof *pointers);
    if (pointers == NULL)
    {
        /* Out of memory: fall back to a plain traversal */
        for (curr = head; curr; curr = curr->next)
            acc += process_node(curr);
        return acc;
    }

    /* Second pass: record every node so the loop below can index them */
    for (i = 0, curr = head; curr; curr = curr->next)
        pointers[i++] = curr;

#pragma omp parallel for reduction(+ : acc)
    for (i = 0; i < size; i++)
    {
        acc += process_node(pointers[i]);
    }

    /* Released after the parallel region has ended */
    free(pointers);
    return acc;
}
/*
 * Sum process_node() over the whole list using OpenMP tasks.
 *
 * One thread walks the list and creates one task per node; the team
 * executes the tasks. The traversal pointer is local to the generating
 * thread and captured by value in each task, and the shared accumulator is
 * updated atomically, so there is no data race.
 */
unsigned long parallel_task_sweep(struct node *head)
{
    unsigned long acc = 0;

#pragma omp parallel
    {
#pragma omp single
        {
            struct node *curr;

            for (curr = head; curr; curr = curr->next)
            {
#pragma omp task firstprivate(curr) shared(acc)
                {
                    /* Do the expensive part outside the atomic update */
                    unsigned long value = process_node(curr);
#pragma omp atomic
                    acc += value;
                }
            }
        }
        /* All tasks are complete after the barrier that ends the region */
    }

    return acc;
}

Binary file not shown.

View file

@ -0,0 +1,27 @@
# Builds the exercise binary `main` from aux.c and main.c with gcc.
CC = gcc
LINK = $(CC)
# Optimised build with OpenMP enabled; the commented line below is the debug alternative.
CFLAGS = -O3 -fopenmp -Wunknown-pragmas
# CFLAGS = -g -fopenmp
# The compile flags are reused at link time.
LDFLAGS = $(CFLAGS)
# LBLAS is not assigned in this file; it is taken from the environment or the make command line.
LIBS = -lm $(LBLAS)
OBJS = aux.o main.o
# Link the objects into the executable.
main: $(OBJS)
$(LINK) -o $@ $(LDFLAGS) $(OBJS) $(LIBS)
# Remove the objects and the executable.
clean:
(rm -f *.o main)
# Pattern rules. CINCLUDES, INCLUDES, CDEFS, PREC, FC and the F* variables are
# not assigned in this file, so they expand to nothing unless set by the caller.
%.o: %.c
$(CC) $(CFLAGS) $(CINCLUDES) $(INCLUDES) $(CDEFS) $(PREC) -c $<
%.o: %.f90
$(FC) $(FCFLAGS) $(FINCLUDES) $(INCLUDES) $(FDEFS) $(DEFINE_PREPEND)$(PREC) -c $<

View file

@ -0,0 +1,91 @@
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <string.h>
#include <math.h>
#include "aux.h"
int ISEED[4] = {0, 0, 0, 1};
int IONE = 1;
char NoTran = 'N';
double DONE = 1.0, DMONE = -1.0;
double alpha = 1.0, beta = 1.0;
/*
 * Allocate four n-by-n matrices of blocks (*a, *b, *c, *d), each block
 * holding nb*nb doubles. The blocks of a, b and c are filled by dlarnv_ (in
 * that order, block by block) and every block of d is a copy of the
 * matching block of c.
 */
void init_data(block ***a, block ***b, block ***c, block ***d, int n, int nb)
{
    int row, col;
    int nbnb = nb * nb;
    block **ma, **mb, **mc, **md;

    /* Arrays of row pointers */
    ma = malloc(n * sizeof(block *));
    mb = malloc(n * sizeof(block *));
    mc = malloc(n * sizeof(block *));
    md = malloc(n * sizeof(block *));
    *a = ma;
    *b = mb;
    *c = mc;
    *d = md;

    for (row = 0; row < n; row++)
    {
        ma[row] = malloc(n * sizeof(block));
        mb[row] = malloc(n * sizeof(block));
        mc[row] = malloc(n * sizeof(block));
        md[row] = malloc(n * sizeof(block));

        for (col = 0; col < n; col++)
        {
            ma[row][col].b = malloc(nbnb * sizeof(double));
            mb[row][col].b = malloc(nbnb * sizeof(double));
            mc[row][col].b = malloc(nbnb * sizeof(double));
            md[row][col].b = malloc(nbnb * sizeof(double));

            /* Fill a, b, c; ISEED is passed to every call */
            dlarnv_(&IONE, ISEED, &nbnb, ma[row][col].b);
            dlarnv_(&IONE, ISEED, &nbnb, mb[row][col].b);
            dlarnv_(&IONE, ISEED, &nbnb, mc[row][col].b);

            /* d starts as an exact copy of c */
            memcpy(md[row][col].b, mc[row][col].b, nbnb * sizeof(double));
        }
    }
}
/* Call dgemm_ on three nb-by-nb blocks with both operands untransposed and
   the file-level alpha and beta (both 1.0); c is the in/out operand. */
void block_mult(block a, block b, block c, int nb)
{
    double *lhs = a.b;
    double *rhs = b.b;
    double *out = c.b;

    dgemm_(&NoTran, &NoTran, &nb, &nb, &nb, &alpha, lhs, &nb, rhs, &nb, &beta, out, &nb);
}
/*
 * Print the largest absolute difference between corresponding coefficients
 * of the two n-by-n block matrices `c` and `d` (blocks of nb*nb doubles).
 *
 * fabs() is used for the differences: the integer abs() converts its
 * argument to int first, which truncates every difference below 1 to 0.
 */
void compare_matrices(block **c, block **d, int n, int nb)
{
    int i, j, k, nbnb;
    double mx, diff;

    nbnb = nb * nb;
    mx = 0.0;

    for (i = 0; i < n; i++)
    {
        for (j = 0; j < n; j++)
        {
            for (k = 0; k < nbnb; k++)
            {
                diff = fabs((c[i][j].b)[k] - (d[i][j].b)[k]);
                if (diff > mx)
                    mx = diff;
            }
        }
    }

    printf("The maximum difference on coefficients is %e\n", mx);
}
/* Return the current gettimeofday() time expressed in microseconds. */
long usecs()
{
    const long usec_per_sec = 1000000;
    struct timeval now;

    gettimeofday(&now, NULL);
    return now.tv_sec * usec_per_sec + now.tv_usec;
}

View file

@ -0,0 +1,19 @@
/* One matrix block: a pointer to its coefficients. init_data() allocates
   nb*nb doubles for each block. */
struct {
double *b; /* coefficient storage of the block */
} typedef block;
void init_data(block ***a, block ***b, block ***c, block ***d, int n, int nb);
long usecs ();
void block_mult(block a, block b, block c, int nb);
void compare_matrices(block **c, block **d, int n, int nb);
void dgemm_ (char *TRANSA, char *TRANSB,
int *M, int *N, int *K,
double *ALPHA,
double *A, int *LDA,
double *B, int *LDB,
double *BETA,
double *C, int *LDC);
void dlarnv_(int *idist, int *iseed, int *n, double *x);

View file

@ -0,0 +1,77 @@
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <string.h>
#include <math.h>
#include <omp.h>
#include "aux.h"
void sequential_product(block **a, block **b, block **c, int n, int nb);
void parallel_product(block **a, block **b, block **c, int n, int nb);
/* Driver: reads the number of blocks per dimension, builds the matrices and
   times the sequential and the task-parallel products, then compares them. */
int main(int argc, char **argv)
{
    block **a, **b, **c, **d;
    long t_start, t_end;
    int n;
    /* Block size, statically fixed to 100; only change it for debugging */
    int nb = 100;

    if (argc != 2)
    {
        printf("Usage:\n\n ./main n\n\nwhere n is the number of blocks in rows and columns of the matrices.\n");
        return 1;
    }
    n = atoi(argv[1]); /* for a matrix of size nxn blocks */

    init_data(&a, &b, &c, &d, n, nb);

    /* Sequential version, accumulating into c */
    t_start = usecs();
    sequential_product(a, b, c, n, nb);
    t_end = usecs();
    printf("Sequential time : %8.2f msec.\n", ((double)t_end - t_start) / 1000.0);

    /* Parallel version with tasks, accumulating into d */
    t_start = usecs();
    parallel_product(a, b, d, n, nb);
    t_end = usecs();
    printf("Parallel time : %8.2f msec.\n", ((double)t_end - t_start) / 1000.0);

    /* Compare the two resulting matrices */
    compare_matrices(c, d, n, nb);

    return 0;
}
/* Blocked matrix product: for every output block c[i][j], apply
   block_mult() with a[i][k] and b[k][j] for k = 0..n-1, in order. */
void sequential_product(block **a, block **b, block **c, int n, int nb)
{
    int row, col, inner;

    for (row = 0; row < n; row++)
    {
        for (col = 0; col < n; col++)
        {
            inner = 0;
            while (inner < n)
            {
                block_mult(a[row][inner], b[inner][col], c[row][col], nb);
                inner++;
            }
        }
    }
}
/* Task-parallel blocked matrix product: one thread generates one task per
   output block c[i][j]; each task runs the whole k loop for its block. */
void parallel_product(block **a, block **b, block **c, int n, int nb)
{
    int i, j, k;

#pragma omp parallel private(i, j, k)
    {
#pragma omp single
        {
            for (i = 0; i < n; i++)
            {
                for (j = 0; j < n; j++)
                {
                    /* i and j are captured by value when the task is created */
#pragma omp task firstprivate(i, j)
                    {
                        for (k = 0; k < n; k++)
                        {
                            block_mult(a[i][k], b[k][j], c[i][j], nb);
                        }
                    }
                }
            }
        }
    }
}

Binary file not shown.

10
BE_OpenMP_2014/pack.sh Normal file
View file

@ -0,0 +1,10 @@
#!/bin/bash
# Packages the exercise directories and responses.txt into $USER.tgz and moves
# the archive into the shared submission directory.
# NOTE(review): no command is checked for failure and $USER is unquoted — confirm this is acceptable.
# Remove build products first.
make clean;
# Stage everything in a scratch directory named after the user.
mkdir $USER
cp -r responses.txt derivative_free linked_list matrix_multiplication reduction synchronizations $USER;
# Drop any PDF from the staged copy.
cd $USER; find . -name "*.pdf" | xargs rm; cd ..;
# Archive the staged copy, then delete it.
tar zcvf $USER.tgz $USER;
rm -rf $USER;
# Move the archive to the submission directory and make it owner read/write only.
mv $USER.tgz /mnt/n7fs/ens/tp_abuttari/rendus_2014
chmod 600 /mnt/n7fs/ens/tp_abuttari/rendus_2014/$USER.tgz

View file

@ -0,0 +1,31 @@
# Builds the exercise binary `main` from aux.c and main.c with gcc.
CC = gcc
LINK = $(CC)
# Optimised build with OpenMP enabled; the commented line below is the debug alternative.
CFLAGS = -O3 -fopenmp -Wunknown-pragmas
# CFLAGS = -g -fopenmp
# The compile flags are reused at link time.
LDFLAGS = $(CFLAGS)
LIBS = -lm
OBJS = aux.o main.o
# Link the objects into the executable.
main: $(OBJS)
$(LINK) -o $@ $(LDFLAGS) $(OBJS) $(LIBS)
# Remove the objects and the executable.
clean:
(rm -f *.o main)
# Pattern rules. CINCLUDES, INCLUDES, CDEFS, PREC, FC and the F* variables are
# not assigned in this file, so they expand to nothing unless set by the caller.
%.o: %.c
$(CC) $(CFLAGS) $(CINCLUDES) $(INCLUDES) $(CDEFS) $(PREC) -c $<
%.o: %.f90
$(FC) $(FCFLAGS) $(FINCLUDES) $(INCLUDES) $(FDEFS) $(DEFINE_PREPEND)$(PREC) -c $<

View file

@ -0,0 +1,32 @@
#include <sys/time.h>
#include <stdlib.h>
/* Return the current gettimeofday() time expressed in microseconds. */
long usecs (){
    const long usec_per_sec = 1000000;
    struct timeval now;

    gettimeofday(&now, NULL);
    return now.tv_sec * usec_per_sec + now.tv_usec;
}
/* Busy-wait: spin on usecs() until at least `sec` seconds have elapsed. */
void mysleep(double sec){
    long start;
    long now = 0;

    start = usecs();
    /* `now` starts at 0, so the very first test uses -start as the elapsed
       time; the clock is only sampled again inside the loop. */
    for (;;) {
        double elapsed = ((double) now - start)/1000000;
        if (!(elapsed < sec))
            break;
        now = usecs();
    }
}
/* Deliberately slow reduction operator: spin for 0.01 s, then add *b to *a. */
void operator(int *a, int *b){
    mysleep(0.01);
    *a = *a + *b;
}

View file

@ -0,0 +1,3 @@
long usecs ();
void operator(int *a, int *b);
void mysleep(double sec);

View file

@ -0,0 +1,79 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include "omp.h"
#include "aux.h"
int sequential_reduction(int *x, int n);
int parallel_reduction(int *x, int n);
/*
 * Driver: reads the array length `n`, fills an array with the same
 * pseudo-random values before each run, and times the sequential and the
 * parallel reduction.
 *
 * Returns 1 on a usage error, a non-positive length or an allocation
 * failure, 0 otherwise.
 */
int main(int argc, char **argv)
{
    int n, i, result;
    long t_start, t_end;
    int *x;

    /* Command line argument: array length */
    if (argc == 2)
    {
        n = atoi(argv[1]); /* the length of the array */
    }
    else
    {
        printf("Usage:\n\n ./main n\n\nwhere n is the length of the array to be used.\n");
        return 1;
    }

    /* The reductions read x[0] and the fill uses rand() % n, so n must be >= 1 */
    if (n < 1)
    {
        fprintf(stderr, "n must be a positive integer\n");
        return 1;
    }

    x = malloc(sizeof *x * n);
    if (x == NULL)
    {
        perror("malloc");
        return 1;
    }

    /* Fill the array with random numbers */
    srand(1);
    for (i = 0; i < n; i++)
        x[i] = rand() % n;

    /* Sequential reduction (accumulates into x[0]) */
    t_start = usecs();
    result = sequential_reduction(x, n);
    t_end = usecs();
    printf("Sequential time : %8.2f msec. --- Result: %d\n", ((double)t_end - t_start) / 1000.0, result);

    /* Refill with the same sequence, since the sequential run modified x[0] */
    srand(1);
    for (i = 0; i < n; i++)
        x[i] = rand() % n;

    /* Parallel reduction */
    t_start = usecs();
    result = parallel_reduction(x, n);
    t_end = usecs();
    printf("Parallel time : %8.2f msec. --- Result: %d\n", ((double)t_end - t_start) / 1000.0, result);

    free(x);
    return 0;
}
/* Fold x[1..n-1] into x[0] with operator(), left to right, and return x[0].
   The array is modified in place. */
int sequential_reduction(int *x, int n)
{
    int k = 1;

    while (k < n)
    {
        operator(x, &x[k]);
        k++;
    }
    return *x;
}
/* Combine x[0..n-1] with operator() in parallel: each thread accumulates
   into its private copy of `total`, and the `+` reduction merges the copies.
   The array itself is not modified. */
int parallel_reduction(int *x, int n)
{
    int total = 0;
    int idx;

#pragma omp parallel for reduction(+ : total)
    for (idx = 0; idx < n; idx++)
    {
        operator(&total, &x[idx]);
    }

    return total;
}

Binary file not shown.

View file

@ -0,0 +1,43 @@
In this file you have to write the answers to the questions marked
with the "pencil" symbol that you will find in the subjects of the
various exam parts.
===============================================================================
Part 1: Derivative-free minimization
===============================================================================
Part 2: Linked list
===============================================================================
Part 3: Matrix Multiplication
===============================================================================
Part 4: Reduction
=======================================================================
Part 5: Synchronizations

BIN
BE_OpenMP_2014/subject.pdf Normal file

Binary file not shown.

View file

@ -0,0 +1,31 @@
# Builds the exercise binary `main` from aux.c and main.c with gcc.
CC = gcc
LINK = $(CC)
# Optimised build with OpenMP enabled; the commented line below is the debug alternative.
CFLAGS = -O3 -fopenmp -Wunknown-pragmas
# CFLAGS = -g -fopenmp
# The compile flags are reused at link time.
LDFLAGS = $(CFLAGS)
LIBS = -lm
OBJS = aux.o main.o
# Link the objects into the executable.
main: $(OBJS)
$(LINK) -o $@ $(LDFLAGS) $(OBJS) $(LIBS)
# Remove the objects and the executable.
clean:
(rm -f *.o main)
# Pattern rules. CINCLUDES, INCLUDES, CDEFS, PREC, FC and the F* variables are
# not assigned in this file, so they expand to nothing unless set by the caller.
%.o: %.c
$(CC) $(CFLAGS) $(CINCLUDES) $(INCLUDES) $(CDEFS) $(PREC) -c $<
%.o: %.f90
$(FC) $(FCFLAGS) $(FINCLUDES) $(INCLUDES) $(FDEFS) $(DEFINE_PREPEND)$(PREC) -c $<

View file

@ -0,0 +1,31 @@
#include <stdlib.h>
#include <sys/time.h>
/* Return the current gettimeofday() time expressed in microseconds. */
long usecs (){
    const long usec_per_sec = 1000000;
    struct timeval now;

    gettimeofday(&now, NULL);
    return now.tv_sec * usec_per_sec + now.tv_usec;
}
/* Busy-wait: spin on usecs() until at least `sec` seconds have elapsed. */
void mysleep(double sec){
    long start;
    long now = 0;

    start = usecs();
    /* `now` starts at 0, so the very first test uses -start as the elapsed
       time; the clock is only sampled again inside the loop. */
    for (;;) {
        double elapsed = ((double) now - start)/1000000;
        if (!(elapsed < sec))
            break;
        now = usecs();
    }
}
/* Simulated unit of work: spin for a very short fixed delay and return 1. */
int func(){
    const int increment = 1;

    mysleep(1e-7);
    return increment;
}

View file

@ -0,0 +1,4 @@
long usecs ();
int func();
void mysleep(double sec);

View file

@ -0,0 +1,156 @@
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "omp.h"
#include "aux.h"
#define NIT 1000000
#define NE 10000
void sequential(int *data);
void parallel_critical(int *data);
void parallel_atomic(int *data);
void parallel_locks(int *data);
int main(int argc, char **argv)
{
int n, i, j, k, nth, thn, cnt;
long t_start, t_end, save;
double s, z, x, y, nx, ny, nz, mz;
int data[NE];
/* Initialize data */
for (i = 0; i < NE; i++)
data[i] = 0;
t_start = usecs();
sequential(data);
t_end = usecs();
for (cnt = 0, i = 0; i < NE; i++)
{
cnt += data[i];
}
printf("Sequential time : %8.2f msec.", ((double)t_end - t_start) / 1000.0);
printf(" -- result: %4d\n", cnt);
/***********************************************************************/
/***********************************************************************/
/***********************************************************************/
for (i = 0; i < NE; i++)
data[i] = 0;
t_start = usecs();
parallel_critical(data);
t_end = usecs();
for (cnt = 0, i = 0; i < NE; i++)
{
cnt += data[i];
}
printf("Critical time : %8.2f msec.", ((double)t_end - t_start) / 1000.0);
printf(" -- result: %4d\n", cnt);
/***********************************************************************/
/***********************************************************************/
/***********************************************************************/
for (i = 0; i < NE; i++)
data[i] = 0;
t_start = usecs();
parallel_atomic(data);
t_end = usecs();
for (cnt = 0, i = 0; i < NE; i++)
{
cnt += data[i];
}
printf("Atomic time : %8.2f msec.", ((double)t_end - t_start) / 1000.0);
printf(" -- result: %4d\n", cnt);
/***********************************************************************/
/***********************************************************************/
/***********************************************************************/
for (i = 0; i < NE; i++)
{
data[i] = 0;
}
t_start = usecs();
parallel_locks(data);
t_end = usecs();
for (cnt = 0, i = 0; i < NE; i++)
{
cnt += data[i];
}
printf("Locks time : %8.2f msec.", ((double)t_end - t_start) / 1000.0);
printf(" -- result: %4d\n", cnt);
return 0;
}
/* Reference version: NIT times, pick a pseudo-random slot of `data`
   (NE entries) and add func() to it. */
void sequential(int *data)
{
    int remaining;

    for (remaining = NIT; remaining > 0; remaining--)
    {
        int slot = rand() % NE;
        data[slot] += func();
    }
}
/*
 * Parallel version protected by a critical section: NIT times, pick a
 * pseudo-random slot of `data` (NE entries) and add func() to it.
 *
 * The slot index is declared inside the loop body so that each thread has
 * its own copy; a shared index could be overwritten by another thread
 * between the draw and the update.
 */
void parallel_critical(int *data)
{
    int i;

#pragma omp parallel for
    for (i = 0; i < NIT; i++)
    {
        int j = rand() % NE; /* private to the iteration */

#pragma omp critical
        data[j] += func();
    }
}
/*
 * Parallel version protected by an atomic update: NIT times, pick a
 * pseudo-random slot of `data` (NE entries) and add func() to it.
 *
 * The slot index is declared inside the loop body so that each thread has
 * its own copy; a shared index could be overwritten by another thread
 * between the draw and the update.
 */
void parallel_atomic(int *data)
{
    int i;

#pragma omp parallel for
    for (i = 0; i < NIT; i++)
    {
        int j = rand() % NE; /* private to the iteration */

#pragma omp atomic update
        data[j] += func();
    }
}
/*
 * Parallel version protected by an OpenMP lock: NIT times, pick a
 * pseudo-random slot of `data` (NE entries) and add func() to it while
 * holding the lock.
 *
 * A single lock guards the whole array. It lives on the stack, so no
 * allocation can fail, and it is destroyed once the parallel loop is over.
 * The slot index is private to each iteration.
 */
void parallel_locks(int *data)
{
    int i;
    omp_lock_t lock;

    omp_init_lock(&lock);

#pragma omp parallel for shared(lock)
    for (i = 0; i < NIT; i++)
    {
        int j = rand() % NE; /* private to the iteration */

        omp_set_lock(&lock);
        data[j] += func();
        omp_unset_lock(&lock);
    }

    omp_destroy_lock(&lock);
}

Binary file not shown.

Binary file not shown.

13
BE_OpenMP_2019/Makefile Normal file
View file

@ -0,0 +1,13 @@
# Top-level helper: `make clean` enters each exercise directory in turn and
# runs that directory's own `clean` target in a sub-shell.
clean:
(cd band_matrix; make clean)
(cd lu_tasks; make clean)
(cd norm2_noowr; make clean)
(cd stacks; make clean)
(cd tree_branch; make clean)

View file

@ -0,0 +1,23 @@
# Builds the exercise binary `main` from aux.c and main.c with gcc.
CC = gcc
LINK = $(CC)
# Optimised build with OpenMP enabled; the commented line below is the debug alternative.
CFLAGS = -O3 -fopenmp -Wunknown-pragmas
# CFLAGS = -g -fopenmp
# The compile flags are reused at link time.
LDFLAGS = $(CFLAGS)
LIBS = -lm
OBJS = aux.o main.o
# Link the objects into the executable.
main: $(OBJS)
$(LINK) -o $@ $(LDFLAGS) $(OBJS) $(LIBS)
# Remove the objects and the executable.
clean:
(rm -f *.o main)
# Pattern rules. CINCLUDES, INCLUDES, CDEFS, PREC, FC and the F* variables are
# not assigned in this file, so they expand to nothing unless set by the caller.
%.o: %.c
$(CC) $(CFLAGS) $(CINCLUDES) $(INCLUDES) $(CDEFS) $(PREC) -c $<
%.o: %.f90
$(FC) $(FCFLAGS) $(FINCLUDES) $(INCLUDES) $(FDEFS) $(DEFINE_PREPEND)$(PREC) -c $<

View file

@ -0,0 +1,101 @@
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <string.h>
#include <math.h>
#include "aux.h"
int ISEED[4] = {0,0,0,1};
int IONE=1;
char NoTran = 'N';
double DONE=1.0, DMONE=-1.0;
double alpha=1.0, beta=1.0;
/*
 * Fill the test data for the band-matrix product.
 *
 *   x  : n pseudo-random values in [0, 1].
 *   A  : n-by-n matrix; entries with |row - col| <= b are pseudo-random in
 *        [0, 1], all others are 0.
 *   Ac : compact copy of the band with 2*b+1 rows and n columns, where
 *        Ac[row - col + b][col] = A[row][col]; positions that fall outside
 *        the matrix are set to 0.
 */
void init_data(double **A, double **Ac, double *x, int n, int b){
    int row, col, diag;

    for (row = 0; row < n; row++)
        x[row] = ((double)rand())/((double)RAND_MAX);

    /* Full matrix: random inside the band, zero outside */
    for (row = 0; row < n; row++) {
        for (col = 0; col < n; col++) {
            int in_band = abs(row-col) <= b;
            A[row][col] = in_band ? ((double)rand())/((double)RAND_MAX) : 0.0;
        }
    }

    /* Compact storage: one row of Ac per diagonal of the band */
    for (col = 0; col < n; col++) {
        for (diag = 0; diag < 2*b+1; diag++) {
            row = col - b + diag;
            Ac[diag][col] = (row < 0 || row >= n) ? 0.0 : A[row][col];
        }
    }
}
/*
 * Compare two result vectors of length n and print whether they agree.
 *
 * The largest absolute difference between y1 and y2 is divided by the
 * largest absolute entry of y1; the result is reported as wrong when that
 * ratio exceeds 1e-10.
 *
 * fabs() is used throughout: the integer abs() converts its argument to int
 * first, which truncates every magnitude below 1 to 0 and hides real errors.
 */
void check_result(double *y1, double *y2, int n){
    int i;
    double mxdif, mx, v;

    mx = 0.0;
    mxdif = 0.0;

    for(i=0; i<n; i++){
        v = fabs(y1[i]);
        if(v>mx)
            mx = v;
        v = fabs(y1[i]-y2[i]);
        if(v>mxdif)
            mxdif = v;
    }

    if(mxdif/mx>1e-10){
        printf("!!! Result is wrong !!!\n");
    } else {
        printf("Result is correct\n");
    }
    return;
}
/* Return the current gettimeofday() time expressed in microseconds. */
long usecs (){
    const long usec_per_sec = 1000000;
    struct timeval now;

    gettimeofday(&now, NULL);
    return now.tv_sec * usec_per_sec + now.tv_usec;
}

View file

@ -0,0 +1,17 @@
void init_data(double **A, double **Ac, double *x, int n, int b);
long usecs ();
void dgemm_ (char *TRANSA, char *TRANSB,
int *M, int *N, int *K,
double *ALPHA,
double *A, int *LDA,
double *B, int *LDB,
double *BETA,
double *C, int *LDC);
void dlarnv_(int *idist, int *iseed, int *n, double *x);
void matmul_compact_row(double **A, double *x, double *y, int n, int b);
void matmul_compact_diag(double **A, double *x, double *y, int n, int b);
void matmul(double **A, double *x, double *y, int n);
void check_result(double *y1, double *y2, int n);

View file

@ -0,0 +1,142 @@
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <string.h>
#include <math.h>
#include "omp.h"
#include "aux.h"
/* Allocate `bytes` bytes, or report the failure and terminate the program. */
static void *checked_malloc(size_t bytes)
{
    void *p = malloc(bytes);

    if (p == NULL)
    {
        perror("malloc");
        exit(EXIT_FAILURE);
    }
    return p;
}

/*
 * Driver: reads the matrix size `n` and the band width `b`, builds the full
 * matrix and its compact band copy, then times the full product and the two
 * compact products, checking each compact result against the full one.
 *
 * Returns 1 on a usage error or on invalid sizes, 0 otherwise.
 */
int main(int argc, char **argv)
{
    int n, b, i;
    long t_start, t_end;
    double **A, **Ac, *x, *y1, *y2, *y3;

    /* Command line arguments: matrix size and band width */
    if (argc == 3)
    {
        n = atoi(argv[1]); /* the matrix size */
        b = atoi(argv[2]); /* the band width */
    }
    else
    {
        printf("Usage:\n\n ./main n b\n\nwhere n is the number of blocks in rows and\n");
        printf("columns of the matrix and b the band width.\n");
        return 1;
    }

    /* Negative or zero sizes would give invalid allocation sizes below */
    if (n < 1 || b < 0)
    {
        fprintf(stderr, "n must be positive and b must not be negative\n");
        return 1;
    }

    printf("\n");

    A = checked_malloc(n * sizeof *A);
    Ac = checked_malloc((2 * b + 1) * sizeof *Ac);
    x = checked_malloc(n * sizeof *x);
    y1 = checked_malloc(n * sizeof *y1);
    y2 = checked_malloc(n * sizeof *y2);
    y3 = checked_malloc(n * sizeof *y3);

    for (i = 0; i < n; i++)
    {
        A[i] = checked_malloc(n * sizeof **A);
    }
    /* The compact matrix has one row per diagonal of the band */
    for (i = 0; i < 2 * b + 1; i++)
    {
        Ac[i] = checked_malloc(n * sizeof **Ac);
    }

    init_data(A, Ac, x, n, b);

    /* Reference product with the full matrix */
    t_start = usecs();
    matmul(A, x, y1, n);
    t_end = usecs();
    printf("Full matrix. Time : %8.2f msec.\n", ((double)t_end - t_start) / 1000.0);

    /* Compact storage, row-oriented product */
    t_start = usecs();
    matmul_compact_row(Ac, x, y2, n, b);
    t_end = usecs();
    printf("Compact by rows. Time : %8.2f msec. ", ((double)t_end - t_start) / 1000.0);
    /* Compare the two resulting vectors */
    check_result(y1, y2, n);

    /* Compact storage, diagonal-oriented product */
    t_start = usecs();
    matmul_compact_diag(Ac, x, y3, n, b);
    t_end = usecs();
    printf("Compact by diagonals. Time : %8.2f msec. ", ((double)t_end - t_start) / 1000.0);
    check_result(y1, y3, n);

    for (i = 0; i < n; i++)
        free(A[i]);
    for (i = 0; i < 2 * b + 1; i++)
        free(Ac[i]);
    free(A);
    free(Ac);
    free(x);
    free(y1);
    free(y2);
    free(y3);

    return 0;
}
/* Dense matrix-vector product y = A * x for an n-by-n matrix stored as an
   array of row pointers. */
void matmul(double **A, double *x, double *y, int n)
{
    int row, col;

    for (row = 0; row < n; row++)
    {
        double *out = &y[row];

        *out = 0;
        col = 0;
        while (col < n)
        {
            *out += A[row][col] * x[col];
            col++;
        }
    }
}
/*
 * Band matrix-vector product y = A * x using the compact storage Ac, where
 * Ac[i - j + b][j] holds A[i][j] for |i - j| <= b (2*b+1 rows, n columns).
 *
 * The work is split by output row: each y[i] is computed by exactly one
 * thread from the columns j in [i-b, i+b] that lie inside the matrix, so
 * there is no concurrent update and a single parallel loop is enough.
 * Clamping the column range also guarantees that no index leaves y, x or
 * the used part of Ac, and the padding entries of Ac are never read.
 */
void matmul_compact_row(double **Ac, double *x, double *y, int n, int b)
{
    int i, j;

#pragma omp parallel for private(j)
    for (i = 0; i < n; i++)
    {
        /* Columns of row i that are both inside the band and the matrix */
        int first = (i - b < 0) ? 0 : i - b;
        int last = (i + b > n - 1) ? n - 1 : i + b;
        double sum = 0;

        for (j = first; j <= last; j++)
        {
            sum += Ac[i - j + b][j] * x[j];
        }
        y[i] = sum;
    }
    return;
}
/*
 * Band matrix-vector product y = A * x using the compact storage Ac,
 * processed one diagonal (one row of Ac) at a time.
 *
 * Row i of Ac holds the diagonal whose entry in column j belongs to matrix
 * row j - b + i. Within one diagonal every column updates a different
 * y entry, so the column loop can be shared among the threads; the implicit
 * barrier at the end of each `omp for` keeps the diagonals in order.
 *
 * The whole routine is a single parallel region, so the worksharing loops
 * are actually executed by the team.
 */
void matmul_compact_diag(double **Ac, double *x, double *y, int n, int b)
{
    int i, j;

#pragma omp parallel private(i, j)
    {
#pragma omp for
        for (i = 0; i < n; i++)
        {
            y[i] = 0;
        }

        /* Every thread runs the diagonal loop; the columns are shared */
        for (i = 0; i < 2 * b + 1; i++)
        {
#pragma omp for
            for (j = 0; j < n; j++)
            {
                /* Skip the padding entries that fall outside the matrix */
                if ((j - b + i) >= 0 && (j - b + i) < n)
                {
                    y[j - b + i] += Ac[i][j] * x[j];
                }
            }
        }
    }
    return;
}

Binary file not shown.

View file

@ -0,0 +1,30 @@
# Builds the LU exercise binary `main` from C and Fortran sources; the final
# link is done with the Fortran compiler.
CC = gcc
FC = gfortran
LINK = $(FC)
FCFLAGS = -O3 -fopenmp
CFLAGS = -O3 -fopenmp -Wunknown-pragmas
LDFLAGS = $(FCFLAGS)
# Site-specific static ACML library.
LBLAS = /mnt/n7fs/ens/tp_abuttari/acml-4.4.0/gfortran64/lib/libacml.a -lm
OBJS = aux.o auxf.o lu_seq.o lu_par_tasks.o kernels.o main.o trace.o
# `clean` is listed as a prerequisite of `main`.
main: clean $(OBJS)
$(LINK) $(LDFLAGS) -o $@ $(OBJS) $(LBLAS)
# Rebuild from scratch with -DDBG passed to the C compiler through CDEFS.
main_dbg:
($(MAKE) clean main CDEFS=-DDBG)
# Remove objects, generated .svg files and the executable.
clean:
(rm -f *.o *.svg main)
# Pattern rules. CINCLUDES, INCLUDES, PREC, FINCLUDES, FDEFS and
# DEFINE_PREPEND are not assigned in this file.
%.o: %.c
$(CC) $(CFLAGS) $(CINCLUDES) $(INCLUDES) $(CDEFS) $(PREC) -c $<
%.o: %.f90
$(FC) $(FCFLAGS) $(FINCLUDES) $(INCLUDES) $(FDEFS) $(DEFINE_PREPEND)$(PREC) -c $<

View file

@ -0,0 +1,10 @@
#include <sys/time.h>
#include <stdio.h>
#include <stdlib.h>
/* Return the current gettimeofday() time expressed in microseconds. */
long usecs (){
    const long usec_per_sec = 1000000;
    struct timeval now;

    gettimeofday(&now, NULL);
    return now.tv_sec * usec_per_sec + now.tv_usec;
}

View file

@ -0,0 +1,28 @@
! C-callable wrapper (bind(c)) that forwards to the external function dnrm2.
! n and incx are received by value; x is an array of n doubles.
! Returns whatever dnrm2 returns (presumably the Euclidean norm of x -- confirm
! against the BLAS in use).
function dnrm2_c(n, x, incx) bind(c)
use iso_c_binding
integer(c_int), value :: n, incx
real(c_double) :: x(n), dnrm2_c
real(kind(1.d0)) :: dnrm2
dnrm2_c = dnrm2(n, x(1), incx)
return
end function dnrm2_c
! C-callable wrapper (bind(c)) that forwards to the external function dlange
! with norm selector 'F' on the lda-by-n array a.
! m, n and lda are received by value.
! NOTE(review): the work argument w is a scalar; this assumes dlange does not
! reference it for the 'F' norm -- confirm against the LAPACK documentation.
function dnrmf_c(m, n, a, lda) bind(c)
use iso_c_binding
integer(c_int), value :: m, n, lda
real(c_double) :: a(lda,n), dnrmf_c
real(kind(1.d0)) :: dlange, w
dnrmf_c = dlange('F', m, n, a(1,1), lda, w)
end function dnrmf_c

View file

@ -0,0 +1,41 @@
/* Description of a matrix split into block-columns, passed by value to the
   factorization routines together with the shared pivot vector. */
typedef struct infostruct{
int B; // The size of block-columns
int NB; // The number of block-columns in a matrix
int N; // The total size of the matrix N=B*NB
int *ipiv; // Permutation vector for numerical pivoting
} info_type;
typedef double** Matrix;
/* typedef struct matstruct{ */
/* double **BC; // Permutation vector for numerical pivoting */
/* } Matrix; */
typedef enum {PNL = 0, UPD, END, NONE} Type;
void dgetrf_(int *m, int *n, double *a, int *lda, int *ipiv, int *info);
void dtrsm_(char *side, char *uplo, char *transa, char *diag,
int *m, int *n, const double *alpha, const double *A, int *lda,
double *B, int *ldb);
void dlaswp_(int *n, double * a, int *lda, int *k1, int *k2, int *ipiv, int *incx);
void dlarnv_(int *idist, int *iseed, int *n, double *x);
void dgetrs_(char *t, int *n, int *nrhs, double *A, int *lda, int *ipiv, double *x, int *incx, int *info);
void dgemv_(char *t, int *m, int *n, const double *alpha, const double *A, int *lda, const double *x, int *incx, const double *beta, double *y, int *incy);
double dnrm2_c(int n, double *x, int incx);
double dnrmf_c(int m, int n, double *A, int lda);
void dgemm_(char *ta, char *tb, int *m, int *n, int *k, const double *alpha, const double *A, int *lda, const double *B, int *ldB, const double *beta, const double *c, int *ldc);
void panel(double *P, int k, info_type info );
void update(double *P, double *U, int k, int j, info_type info);
void backperm(Matrix A, info_type info);
void checkres(double *A, double *Acpy, info_type info);
long usecs ();
void lu_seq (Matrix A, info_type info);
void lu_par_tasks(Matrix A, info_type info);

View file

@ -0,0 +1,114 @@
#include "trace.h"
#include "common.h"
#include <math.h>
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* Factorize block-column k: call dgetrf_ on the part of column `P` that
   starts at row k*B, storing the pivots at the same offset in info.ipiv.
   The call is bracketed by PNL trace events. */
void panel(double *P, int k, info_type info){
    int first_row = k*info.B;       /* first row of the panel */
    int rows = info.N-first_row;    /* rows from first_row to the bottom */
    int cols = info.B;              /* width of one block-column */
    int ld = info.N;                /* leading dimension */
    int err;                        /* dgetrf_ status (not inspected) */

#if defined(DBG)
    printf("%2d -- panel : %d\n",omp_get_thread_num(),k);
#endif

    trace_event_start(PNL);
    dgetrf_(&rows, &cols, P+first_row, &ld, info.ipiv+first_row, &err);
    trace_event_stop(PNL);
}
/* Update block-column `U` (index j) with the factorized panel `P` (index
   k): apply the panel's row interchanges (dlaswp_), a triangular solve
   (dtrsm_) on the top B rows and a matrix product (dgemm_) on the rows
   below. The calls are bracketed by UPD trace events. */
void update(double *P, double *U, int k, int j, info_type info){
    char no_trans = 'N', lower = 'L', unit_diag = 'U', left = 'L';
    int one = 1;
    double plus_one = (double)1.0, minus_one = -1.0;
    int top = k*info.B;              /* first row of the panel */
    int panel_cols = info.B;
    int upd_cols = info.B;
    int below = info.N-top-info.B;   /* rows under the top B-by-B block */
    int ld = info.N;

#if defined(DBG)
    printf("%2d -- update: %d %d\n",omp_get_thread_num(),k,j);
#endif

    trace_event_start(UPD);

    dlaswp_(&upd_cols, U+top, &ld, &one, &panel_cols, info.ipiv+top, &one);

    dtrsm_(&left, &lower, &no_trans, &unit_diag,
           &panel_cols, &upd_cols,
           &plus_one,
           P+top, &ld,
           U+top, &ld);

    dgemm_(&no_trans, &no_trans,
           &below, &upd_cols, &panel_cols,
           &minus_one,
           P+top+panel_cols, &ld,
           U+top, &ld,
           &plus_one,
           U+top+panel_cols, &ld);

    trace_event_stop(UPD);
}
/* Apply row interchanges to the block-columns on the left of each panel.
   For every block-column start i (from the second one on), dlaswp_ is called
   on all block-columns before it, in parallel. The whole routine is
   bracketed by END trace events.
   NOTE: info.ipiv is modified in place (see below). */
void backperm(Matrix A, info_type info){
int i, j, ld, ipb, ipo;
int IONE=1;
ld = info.N;
trace_event_start(END);
for(i=info.B; i<info.N; i+=info.B){
/* Add info.B to every pivot entry from index i on; this is repeated for
   each i, so later entries are shifted several times.
   NOTE(review): presumably converts panel-relative pivot indices to
   matrix-relative ones -- confirm. */
for(j=i; j<info.N; j++)
info.ipiv[j]+=info.B;
/* Range of pivot entries passed to dlaswp_ (i+1 .. i+info.B) */
ipo = i+1;
ipb = i+info.B;
/* One dlaswp_ call per block-column before i; j is the parallel loop index */
#pragma omp parallel for
for(j=0; j<i/info.B; j+=1)
dlaswp_(&info.B, A[j], &ld, &ipo, &ipb, info.ipiv, &IONE);
}
trace_event_stop(END);
return;
}
/*
 * Check a factorization by solving a system with a generated right-hand
 * side and printing a scaled residual norm.
 *
 * A is the factorized matrix (with pivots in info.ipiv) and Acpy the
 * original one. A vector b is filled by dlarnv_, x is obtained from
 * dgetrs_, b is overwritten by dgemv_ with Acpy*x - b, and
 * sqrt(N)*norm2(b)/normF(Acpy) is printed.
 *
 * Nothing is computed if the work vectors cannot be allocated; a non-zero
 * status from dgetrs_ is reported.
 */
void checkres(double *A, double *Acpy, info_type info){
    int ISEED[4] = {0,0,0,3};
    int IONE=1;
    char NoTran = 'N';
    double DONE=1.0, DMONE=-1.0;
    double *x, *b;
    int err;
    double nrm2, nrmf;

    x = malloc(info.N*sizeof *x);
    b = malloc(info.N*sizeof *b);
    if (x == NULL || b == NULL) {
        printf("checkres: out of memory, residual not computed\n");
        free(x);
        free(b);
        return;
    }

    /* Right-hand side, and a copy that the solve overwrites with x */
    dlarnv_(&IONE, ISEED, &info.N, b);
    memcpy( x, b, info.N*sizeof(double) );

    dgetrs_(&NoTran, &(info.N), &IONE, A, &(info.N), info.ipiv, x, &(info.N), &err);
    if (err != 0)
        printf("checkres: dgetrs_ returned info = %d\n", err);

    /* b <- Acpy*x - b */
    dgemv_(&NoTran, &info.N, &info.N, &DONE, Acpy, &info.N, x, &IONE, &DMONE, b, &IONE);

    nrm2 = dnrm2_c(info.N, b, IONE);
    nrmf = dnrmf_c(info.N, info.N, Acpy, info.N);

    printf("Residual norm: %e\n",sqrt(info.N)*nrm2/nrmf);

    free(x);
    free(b);
    return;
}

View file

@ -0,0 +1,44 @@
#include "trace.h"
#include "common.h"
#include <omp.h>
/* Task-parallel routine for the LU factorization of a square
   matrix in block-columns.

   A single thread walks the panel/update loop nest and creates one task
   per panel() and per update() call; the depend clauses on the
   block-column pointers A[i]/A[j] order the tasks, and the other threads
   of the team execute them. */
void lu_par_tasks(Matrix A, info_type info)
{
  int i, j;

  trace_init();

#pragma omp parallel
  {
    /* The "omp" keyword is mandatory: a bare "#pragma single" is an unknown
       pragma that the compiler ignores, which would make every thread run
       the loops below on the shared i and j and create duplicate tasks. */
#pragma omp single
    {
      /* a single thread generates all the tasks for the whole team */
      for (i = 0; i < info.NB; i++)
      {
        /* Do the panel */
        /* firstprivate captures the value of i at task-creation time */
#pragma omp task depend(inout : A[i]) firstprivate(i)
        panel(A[i], i, info);

        for (j = i + 1; j < info.NB; j++)
        {
          /* Do all the corresponding updates */
#pragma omp task depend(in : A[i]) depend(inout : A[j]) firstprivate(i, j)
          update(A[i], A[j], i, j, info);
        }
      }
    }
  }

  /* Do row permutations resulting from the numerical pivoting */
  /* This operation is deliberately kept outside of the parallel region */
  backperm(A, info);

  trace_dump("trace_par_tasks.svg");
  return;
}

View file

@ -0,0 +1,31 @@
#include "trace.h"
#include "common.h"
/* Sequential reference LU factorization of a square matrix stored in
   block-columns: every panel is factorized and then applied to all the
   block-columns on its right. */
void lu_seq(Matrix A, info_type info){
  trace_init();

  for(int pnl=0; pnl<info.NB; pnl++){
    /* factorize the current panel */
    panel(A[pnl], pnl, info);

    /* apply it to every block-column to its right */
    for(int col=pnl+1; col<info.NB; col++){
      update(A[pnl], A[col], pnl, col, info);
    }
  }

  /* row permutations resulting from the numerical pivoting */
  backperm(A, info);

  trace_dump("trace_seq.svg");
  return;
}

View file

@ -0,0 +1,86 @@
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <string.h>
#include <math.h>
#include "common.h"
/*
 * Driver: builds a random N x N matrix (N = B*NB), factorizes it with the
 * sequential and with the task-parallel LU routine, and prints time,
 * Gflop/s and the residual for both.
 *
 * Usage: ./main B NB
 * Returns 0 on success, 1 on bad arguments or allocation failure.
 */
int main(int argc, char **argv){
  int N, NB, B, NN;
  double flops;
  double *Acpy1, *Acpy2;
  Matrix A;
  info_type info;
  long t_start,t_end;
  int j;
  int ISEED[4] = {0,0,0,1};
  int IONE=1;
  char *nt;

  if(argc != 3){
    printf("Usage:\n\n./main B NB\n\nwhere B is the size of block-columns and \n\
NB is the number of block-columns the matrix is made of.\n");
    return 1;
  }

  B = atoi(argv[1]); /* block size */
  NB = atoi(argv[2]); /* dimension in blocks */
  if(B <= 0 || NB <= 0){
    printf("B and NB must be positive integers.\n");
    return 1;
  }
  N = B*NB;
  NN = N*N;

  /* getenv returns NULL when the variable is unset; printf("%s", NULL)
     is undefined behavior, so substitute a printable placeholder */
  nt = getenv("OMP_NUM_THREADS");
  if(nt == NULL) nt = "default";

  flops = ((double)2.0*(double) N)*((double) N)*((double) N)/3.0;

  Acpy1 = (double *)malloc(NN*sizeof(double));
  Acpy2 = (double *)malloc(NN*sizeof(double));
  info.ipiv = (int *)malloc(N*sizeof(int));
  A = (double **)malloc(NB*sizeof(double*));
  if(Acpy1 == NULL || Acpy2 == NULL || info.ipiv == NULL || A == NULL){
    printf("Out of memory.\n");
    free(Acpy1);
    free(Acpy2);
    free(info.ipiv);
    free(A);
    return 1;
  }

  /* Acpy1 keeps the original random matrix, Acpy2 is factorized in place */
  dlarnv_(&IONE, ISEED, &NN, Acpy1);

  info.B = B;
  info.NB = NB;
  info.N = N;
  printf("Matrix size: %d\n",N);

  /* get pointers to block-columns */
  for (j=0; j<info.NB; j+=1)
    A[j] = Acpy2+j*info.B*info.N;

  printf("\n========== Sequential (1 threads) ==========\n" );
  memcpy( Acpy2, Acpy1, N*N*sizeof(double) );
  t_start = usecs();
  lu_seq(A, info);
  t_end = usecs();
  printf("Time (msec.) : %7.1f\n",(t_end-t_start)/1e3);
  printf("Gflop/s : %7.1f\n",flops/(t_end-t_start)/1e3);
  checkres(Acpy2, Acpy1, info);

  printf("\n========== Tasks Parallel (%1s threads) ==========\n",nt);
  memcpy( Acpy2, Acpy1, N*N*sizeof(double) );
  t_start = usecs();
  lu_par_tasks(A, info);
  t_end = usecs();
  printf("Time (msec.) : %7.1f\n",(t_end-t_start)/1e3);
  printf("Gflop/s : %7.1f\n",flops/(t_end-t_start)/1e3);
  checkres(Acpy2, Acpy1, info);

  free(A);
  free(info.ipiv);
  free(Acpy2);
  free(Acpy1);
  return 0;
}

Binary file not shown.

View file

@ -0,0 +1,120 @@
#include "trace.h"
#include <omp.h>
/* Per-thread event buffers: events[t][e] is the e-th event recorded by
   thread t (indexed with omp_get_thread_num() in the routines below). */
Event events[MAXTHREADS][MAXEVENTS];
/* nevents[t] is the number of completed events stored for thread t. */
int nevents[MAXTHREADS];
/* Reset the trace: clear every per-thread event counter and record the
   current time as the trace origin.  Only the master thread does the work. */
void trace_init(){
#pragma omp master
  {
    int t = 0;

    while(t < MAXTHREADS){
      nevents[t] = 0;
      t++;
    }

    t_zero = usecs();
  }
  return;
}
void trace_event_start(int type){
int iam;
iam = omp_get_thread_num();
(events[iam][nevents[iam]]).t_start = usecs();
(events[iam][nevents[iam]]).type = type;
return;
}
void trace_event_stop(int type){
int iam;
iam = omp_get_thread_num();
(events[iam][nevents[iam]]).t_stop = usecs();
nevents[iam] +=1;
return;
}
/*
 * Write all recorded events to the SVG file fname: one row per thread,
 * one rectangle per event (colored by event type), plus a time axis with
 * ticks and labels.  Only the master thread writes the file.
 *
 * If the file cannot be opened an error is reported with perror() and
 * nothing is written.  The file is closed before returning.
 */
void trace_dump(char *fname){
  int t, e, i, nth;
  long t_stop;
  Event ev;
  FILE * pFile;
  char *colors[] = { "#d38d5f", "#ffdd55", "#8dd35f", "#80b3ff", "#e580ff"};
  int ncolors = (int)(sizeof(colors)/sizeof(colors[0]));
  double scale_x, scale_y, x;

#pragma omp master
  {
    t_stop = usecs();

    /* nth = 1 + highest thread number that recorded at least one event */
    nth = 0;
    for(i=0; i<MAXTHREADS; i++)
      if(nevents[i] > 0) nth=i+1;

    /* the full time span is mapped onto 1000 horizontal units */
    scale_x = ((double)(t_stop-t_zero))/1000.0;
    scale_y = 0.1;

    pFile = fopen (fname,"w");
    if(pFile == NULL){
      /* no early return here: branching out of the master block is not allowed */
      perror(fname);
    } else {
      fprintf(pFile,"<svg x=\"-11\" y=\"-8\" width=\"%f\" height=\"%f\">\n",
              ((double)t_stop-t_zero)*1.06/scale_x, ((double)nth+1)*1.05/scale_y+8);

      /* one rectangle per recorded event */
      for(t=0; t<MAXTHREADS; t++){
        for(e=0; e<nevents[t]; e++){
          ev = events[t][e];
          fprintf(pFile," "
                  "<rect x=\"%f\" y=\"%f\" width=\"%f\" height=\"%f\" "
                  "fill=\"%7s\" stroke-width=\"0\"/>\n",
                  (double)(ev.t_start-t_zero)/scale_x,
                  ((double)t)/scale_y,
                  ((double)(ev.t_stop-ev.t_start))/scale_x,
                  ((double)1.0)/scale_y,
                  /* unknown event types are drawn in black instead of
                     indexing past the end of the color table */
                  (ev.type >= 0 && ev.type < ncolors) ? colors[ev.type] : "#000000");
        }
      }

      /* horizontal time axis */
      fprintf(pFile, "<line x1=\"0\" y1=\"%f\" x2=\"%f\" y2=\"%f\" \
style=\"stroke:rgb(0,0,0);stroke-width:1\"/>\n",
              ((double)nth+1)*1.01/scale_y,
              ((double)t_stop-t_zero)*1.02/scale_x,
              ((double)nth+1)*1.01/scale_y);

      /* thread labels on the right-hand side */
      for(t=0; t<nth; t++){
        fprintf(pFile,"<text x=\"%f\" y=\"%f\" style=\"font-size:6pt\">Thread %d</text>\n",
                ((double)t_stop-t_zero)*1.01/scale_x, ((double)t+0.8)/scale_y, t);
      }

      fprintf(pFile,"<text x=\"0\" y=\"%f\" style=\"font-size:6pt\">Time (usec.)</text>\n",((double)nth+1)*1.05/scale_y+7);

      /* ten ticks along the time axis, labelled in microseconds */
      for(x=((double)t_stop-t_zero)/scale_x/10.0; x<=((double)t_stop-t_zero)/scale_x; x+=((double)t_stop-t_zero)/scale_x/10.0){
        fprintf(pFile, "<line x1=\"%f\" y1=\"%f\" x2=\"%f\" y2=\"%f\" \
style=\"stroke:rgb(0,0,0);stroke-width:0.5\"/>\n",
                x,((double)nth+1)*1.01/scale_y-1, x, ((double)nth+1)*1.01/scale_y+1);
        fprintf(pFile,"<text x=\"%f\" y=\"%f\" style=\"font-size:6pt\"> %.0f </text>\n",
                x,((double)nth+1)*1.01/scale_y+7,x*scale_x);
      }

      fprintf(pFile,"</svg>\n");
      fclose(pFile);
    }
  }
}

View file

@ -0,0 +1,22 @@
/* Interface of a small per-thread event tracer that dumps its data as SVG. */
#include <string.h>
#include <stdio.h>
/* Time origin of the trace in microseconds; set by trace_init() from usecs().
   NOTE(review): this is a variable definition inside a header, so every
   translation unit including it defines t_zero; whether that links depends
   on the toolchain's handling of common symbols -- confirm. */
long t_zero;
/* Capacity of each per-thread event buffer. */
#define MAXEVENTS 10000
/* Number of per-thread buffers (threads are indexed by their OpenMP number). */
#define MAXTHREADS 48
/* Wall-clock time in microseconds; defined outside of the tracer. */
long usecs ();
/* One traced event: its type (used by trace_dump() to pick the color)
   and its start/stop timestamps as returned by usecs(). */
typedef struct event_struct{
int type;
long t_start, t_stop;
} Event;
/* Reset the event counters and the time origin. */
void trace_init();
/* Open an event of the given type for the calling thread. */
void trace_event_start(int type);
/* Close the calling thread's current event. */
void trace_event_stop(int type);
/* Write the recorded events to the SVG file whose name is given. */
void trace_dump(char *);

View file

@ -0,0 +1,31 @@
# Builds the `main` executable from main.o with gcc and OpenMP enabled.
CC = gcc
LINK = $(CC)
# Optimized build; the commented line below is the debug alternative.
CFLAGS = -O3 -fopenmp
# CFLAGS = -g -fopenmp
# The same flags are used at link time (keeps -fopenmp on the link line).
LDFLAGS = $(CFLAGS)
# Math library.
LIBS = -lm
OBJS = main.o
# Link step.
main: $(OBJS)
$(LINK) -o $@ $(LDFLAGS) $(OBJS) $(LIBS)
# Remove the objects and the executable.
clean:
(rm -f *.o main)
# Pattern rule for C sources.
%.o: %.c
$(CC) $(CFLAGS) $(CINCLUDES) $(INCLUDES) $(CDEFS) $(PREC) -c $<
# Pattern rule for Fortran 90 sources (FC/FCFLAGS are not set in this file).
%.o: %.f90
$(FC) $(FCFLAGS) $(FINCLUDES) $(INCLUDES) $(FDEFS) $(DEFINE_PREPEND)$(PREC) -c $<

View file

@ -0,0 +1,136 @@
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <sys/time.h>
#include <omp.h>
/* Current wall-clock time, expressed in microseconds. */
long usecs()
{
    struct timeval now;
    long stamp;

    gettimeofday(&now, NULL);
    stamp = now.tv_sec * 1000000 + now.tv_usec;
    return stamp;
}
double dnorm2_seq(double *x, int n)
{
int i;
double res, scale, ssq, absxi;
scale = 0.0;
ssq = 1.0;
for (i = 0; i < n; i++)
{
if (x[i] != 0.0)
{
absxi = fabs(x[i]);
if (scale < absxi)
{
ssq = 1.0 + ssq * pow(scale / absxi, 2);
scale = absxi;
}
else
{
ssq = ssq + pow(absxi / scale, 2);
}
}
}
res = scale * sqrt(ssq);
return res;
}
double dnorm2_par(double *x, int n)
{
int i;
double res, scale, ssq, absxi, myssq, myscale;
scale = 0.0;
ssq = 1.0;
#pragma omp parallel private(myssq, myscale, absxi)
#pragma omp for
for (i = 0; i < n; i++)
{
if (x[i] != 0.0)
{
absxi = fabs(x[i]);
if (myscale < absxi)
{
myssq = 1.0 + myssq * pow(myscale / absxi, 2);
myscale = absxi;
}
else
{
myssq = myssq + pow(absxi / myscale, 2);
}
}
#pragma omp critical
{
if (myscale > scale)
{
ssq = 1.0 + myssq * pow(scale / myscale, 2);
scale = myscale;
}
else
{
ssq = ssq + pow(myssq / scale, 2);
}
}
}
res = scale * sqrt(ssq);
return res;
}
/*
 * Driver: fills a vector of n random values, computes its 2-norm with the
 * sequential and the parallel routine, prints both timings and results,
 * and reports whether they agree to a relative tolerance of 1e-10.
 *
 * Usage: ./main n
 * Returns 0 on success, 1 on bad arguments or allocation failure.
 */
int main(int argc, char *argv[])
{
    int n, i;
    double *x;
    double n2_seq, n2_par;
    long t_start, t_end;

    if (argc != 2)
    {
        printf("Wrong number of arguments.\n Usage:\n\n\
./main n \n\n where n is the size of the vector x whose 2-norm has to be computed.\n");
        return 1;
    }

    /* reject non-numeric and non-positive sizes */
    if (sscanf(argv[1], "%d", &n) != 1 || n <= 0)
    {
        printf("The vector size n must be a positive integer.\n");
        return 1;
    }

    x = (double *)malloc(sizeof(double) * n);
    if (x == NULL)
    {
        printf("Out of memory.\n");
        return 1;
    }

    for (i = 0; i < n; i++)
        x[i] = ((double)500.0 * rand() / (RAND_MAX));

    printf("\n================== Sequential version ==================\n");
    t_start = usecs();
    n2_seq = dnorm2_seq(x, n);
    t_end = usecs();
    printf("Time (msec.) : %7.1f\n", (t_end - t_start) / 1e3);
    printf("Computed norm is: %10.3lf\n", n2_seq);

    printf("\n\n=========== Parallel version with reduction ===========\n");
    t_start = usecs();
    n2_par = dnorm2_par(x, n);
    t_end = usecs();
    printf("Time (msec.) : %7.1f\n", (t_end - t_start) / 1e3);
    printf("Computed norm is: %10.3lf\n", n2_par);

    printf("\n\n");
    if (fabs(n2_seq - n2_par) / n2_seq > 1e-10)
    {
        printf("The parallel version is numerically wrong! \n");
    }
    else
    {
        printf("The parallel version is numerically okay!\n");
    }

    free(x);
    return 0;
}

Binary file not shown.

8
BE_OpenMP_2019/pack.sh Normal file
View file

@ -0,0 +1,8 @@
#!/bin/bash
# Package the exam answers: copy responses.txt and the exercise directories
# into a directory named after the user, drop the PDF files from the copy,
# archive it as <user>.tgz and remove the temporary directory.
make clean;
# Stop if the staging directory cannot be created (for instance because it
# already exists): it is removed with rm -rf at the end of the script.
mkdir "$USER" || exit 1
cp -r responses.txt band_matrix lu_tasks norm2_noowr stacks tree_branch "$USER" || exit 1
# Delete PDFs only inside the staging directory; no cd is involved, so a
# failure can never remove PDFs from the current directory.
find "$USER" -name "*.pdf" -exec rm -f {} +
tar zcvf "$USER.tgz" "$USER";
rm -rf "$USER";

View file

@ -0,0 +1,43 @@
In this file you have to write the answers to the questions marked
with the "pencil" symbol that you will find in the subjects of the
various exam parts.
=======================================================================
Part 1: Banded matrix product
=======================================================================
Part 2: LU factorization with tasks
=======================================================================
Part 3: Norm2 without overflow
=======================================================================
Part 4: Stacks
=======================================================================
Part 5: Longest branch of a tree

View file

@ -0,0 +1,27 @@
# Builds the `main` executable from aux.o and main.o with gcc and OpenMP.
CC = gcc
LINK = $(CC)
# Debug build is active here; the commented line is the optimized alternative.
# CFLAGS = -O3 -fopenmp -Wunknown-pragmas
CFLAGS = -g -fopenmp
# The same flags are used at link time (keeps -fopenmp on the link line).
LDFLAGS = $(CFLAGS)
# Math library plus whatever LBLAS expands to (not set in this file).
LIBS = -lm $(LBLAS)
OBJS = aux.o main.o
# Link step.
main: $(OBJS)
$(LINK) -o $@ $(LDFLAGS) $(OBJS) $(LIBS)
# Remove the objects and the executable.
clean:
(rm -f *.o main)
# Pattern rule for C sources.
%.o: %.c
$(CC) $(CFLAGS) $(CINCLUDES) $(INCLUDES) $(CDEFS) $(PREC) -c $<
# Pattern rule for Fortran 90 sources (FC/FCFLAGS are not set in this file).
%.o: %.f90
$(FC) $(FCFLAGS) $(FINCLUDES) $(INCLUDES) $(FDEFS) $(DEFINE_PREPEND)$(PREC) -c $<

166
BE_OpenMP_2019/stacks/aux.c Normal file
View file

@ -0,0 +1,166 @@
#include "aux.h"
/* Pre-drawn sequence of MAXELEMS stack numbers handed out, one at a time,
   by get_random_stack(); filled by init_stacks(). */
int *stacks_list;
/* Scratch counters, one per stack, used by check_result(). */
int *stacks_cnts;
/* cnt : index of the next entry of stacks_list to hand out.
   cnt2: next value returned by process(). */
int cnt, cnt2;
/* Wall-clock time in microseconds, read with gettimeofday(). */
long usecs (){
  struct timeval tv;
  long micros;

  gettimeofday(&tv, NULL);
  micros = tv.tv_sec*1000000+tv.tv_usec;
  return micros;
}
/* Busy-wait: poll the clock until at least sec seconds have elapsed since
   the call.  The calling thread stays active during the whole wait. */
void mysleep(double sec){
  long begin = usecs();
  long now = 0;

  for(;;){
    if(!(((double) now-begin)/1000000 < sec))
      break;
    now = usecs();
  }
  return;
}
/*
 * Allocate and reset n empty stacks (each with room for MAXELEMS values)
 * together with the bookkeeping arrays of this file:
 *   - stacks_list: the MAXELEMS stack numbers, drawn with rand(), that
 *     get_random_stack() hands out;
 *   - stacks_cnts: per-stack counters used by check_result();
 *   - cnt, cnt2  : reset to 0.
 *
 * stacks : receives the newly allocated array of n stacks
 * n      : number of stacks, must be positive
 *
 * The program is terminated with an error message when n is not positive
 * or when memory cannot be allocated.
 */
void init_stacks(stack_t **stacks, int n){
  int i;
  int ncnts;

  if(n <= 0){
    /* rand()%n below would divide by zero or yield invalid stack numbers */
    fprintf(stderr, "init_stacks: the number of stacks must be positive\n");
    exit(EXIT_FAILURE);
  }

  /* stacks_cnts is indexed by stack number in check_result(): it needs at
     least n entries (never fewer than the historical MAXELEMS) */
  ncnts = (n > MAXELEMS) ? n : MAXELEMS;

  *stacks = (stack_t*)malloc(n*sizeof(stack_t));
  if(*stacks == NULL){
    fprintf(stderr, "init_stacks: out of memory\n");
    exit(EXIT_FAILURE);
  }

  for(i=0; i<n; i++){
    (*stacks)[i].cnt = 0;
    (*stacks)[i].elems = (int*)malloc(MAXELEMS*sizeof(int));
    if((*stacks)[i].elems == NULL){
      fprintf(stderr, "init_stacks: out of memory\n");
      exit(EXIT_FAILURE);
    }
  }

  stacks_list = (int*)malloc(MAXELEMS*sizeof(int));
  stacks_cnts = (int*)malloc(ncnts*sizeof(int));
  if(stacks_list == NULL || stacks_cnts == NULL){
    fprintf(stderr, "init_stacks: out of memory\n");
    exit(EXIT_FAILURE);
  }

  for(i=0; i<ncnts; i++)
    stacks_cnts[i] = 0;

  /* target stack of each of the MAXELEMS pushes */
  for(i=0; i<MAXELEMS; i++)
    stacks_list[i] = rand()%n;

  cnt = 0;
  cnt2 = 0;
}
/* Release the n stacks pointed to by *stacks and the bookkeeping arrays
   of this file, and reset the hand-out counter cnt. */
void free_stacks(stack_t **stacks, int n){
  stack_t *pool = *stacks;

  for(int k=0; k<n; k++){
    pool[k].cnt = 0;
    free(pool[k].elems);
  }
  free(pool);

  free(stacks_list);
  free(stacks_cnts);
  cnt = 0;
}
/* Hand out the next stack number of the pre-drawn sequence, or -1 once
   all MAXELEMS entries have been consumed.  The position in the sequence
   is taken with an atomic capture, so concurrent callers get distinct
   entries. */
int get_random_stack(){
  int ticket;

#pragma omp atomic capture
  ticket = cnt++;

  return (ticket < MAXELEMS) ? stacks_list[ticket] : -1;
}
/* Simulate some work (a 0.0001 s busy-wait) and return the next value of
   the shared counter cnt2, taken with an atomic capture. */
int process(){
  int value;

  mysleep(0.0001);

#pragma omp atomic capture
  value = cnt2++;

  return value;
}
/*
 * Verify the content of the n stacks after a run and print the verdict.
 *
 * Two properties are checked:
 *   1. every stack holds exactly as many values as the number of times it
 *      appears in stacks_list;
 *   2. every value in [0, MAXELEMS) is stored in some stack.
 * A stored value outside of [0, MAXELEMS) makes the result false.
 *
 * stacks_cnts is used as scratch space.
 */
void check_result(stack_t *stacks, int n){
  int i, j, v;
  int *check;

  /* 1. stack sizes against the pre-drawn sequence of pushes */
  for(i=0; i<n; i++)
    stacks_cnts[i] = stacks[i].cnt;

  for(i=0; i<MAXELEMS; i++)
    stacks_cnts[stacks_list[i]]--;

  for(i=0; i<n; i++){
    if(stacks_cnts[i] != 0){
      printf("The result is false\n");
      return;
    }
  }

  /* 2. every value must be present */
  check = (int*)calloc(MAXELEMS, sizeof(int));
  if(check == NULL){
    printf("check_result: out of memory\n");
    return;
  }

  for(i=0; i<n; i++)
    for(j=0; j<stacks[i].cnt; j++){
      v = stacks[i].elems[j];
      /* a corrupted value must not be used as an index */
      if(v < 0 || v >= MAXELEMS){
        free(check);
        printf("The result is false\n");
        return;
      }
      check[v] = 1;
    }

  for(i=0; i<MAXELEMS; i++)
    if(check[i] != 1){
      free(check);
      printf("The result is false\n");
      return;
    }

  free(check);
  printf("The result is correct!!!\n");
}

View file

@ -0,0 +1,23 @@
/* Total number of values pushed during a run; also the capacity of the
   elems array that init_stacks() allocates for every stack. */
#define MAXELEMS 1000
#include <sys/time.h>
#include <stdio.h>
#include <stdlib.h>
/* A stack of integers: cnt is the number of values currently stored and
   elems holds them (elems[cnt] is the next free slot). */
typedef struct stackstruct{
int cnt;
int *elems;
} stack_t;
/* Allocate and reset n stacks and the internal bookkeeping. */
void init_stacks(stack_t **stacks, int n);
/* Release what init_stacks() allocated. */
void free_stacks(stack_t **stacks, int n);
/* Next stack number to push on, or -1 when all pushes have been handed out. */
int get_random_stack();
/* Simulated work; returns the value to push. */
int process();
/* Print whether the content of the n stacks is consistent. */
void check_result(stack_t *stacks, int n);
/* Wall-clock time in microseconds. */
long usecs ();
/* Busy-wait for sec seconds. */
void mysleep(double sec);

View file

@ -0,0 +1,158 @@
#include "aux.h"
#include <omp.h>
void stacks_seq(stack_t *stacks, int n);
void stacks_par_critical(stack_t *stacks, int n);
void stacks_par_atomic(stack_t *stacks, int n);
void stacks_par_locks(stack_t *stacks, int n);
/*
 * Driver: runs the sequential, critical, atomic and lock based versions
 * of the stack filling routine on n freshly initialized stacks, printing
 * the execution time and the verdict of check_result() for each of them.
 *
 * Usage: ./main n
 * Returns 0 on success, 1 on bad arguments.
 */
int main(int argc, char **argv)
{
  stack_t *stacks;
  int n;
  long t_start, t_end;

  if (argc == 2)
  {
    n = atoi(argv[1]); /* the number of stacks */
  }
  else
  {
    printf("Usage:\n\n ./main n\n\nwhere n is the number of stacks.\n");
    return 1;
  }

  /* atoi yields 0 on garbage; the stack numbers are drawn modulo n */
  if (n <= 0)
  {
    printf("The number of stacks must be a positive integer.\n");
    return 1;
  }

  printf("\n");

  init_stacks(&stacks, n);
  t_start = usecs();
  stacks_seq(stacks, n);
  t_end = usecs();
  printf("Sequential version. -------- time : %8.2f msec. ", ((double)t_end - t_start) / 1000.0);
  check_result(stacks, n);
  free_stacks(&stacks, n);

  init_stacks(&stacks, n);
  t_start = usecs();
  stacks_par_critical(stacks, n);
  t_end = usecs();
  printf("Critical version. -------- time : %8.2f msec. ", ((double)t_end - t_start) / 1000.0);
  check_result(stacks, n);
  free_stacks(&stacks, n);

  init_stacks(&stacks, n);
  t_start = usecs();
  stacks_par_atomic(stacks, n);
  t_end = usecs();
  printf("Atomic version. -------- time : %8.2f msec. ", ((double)t_end - t_start) / 1000.0);
  check_result(stacks, n);
  free_stacks(&stacks, n);

  init_stacks(&stacks, n);
  t_start = usecs();
  stacks_par_locks(stacks, n);
  t_end = usecs();
  printf("Locks version. -------- time : %8.2f msec. ", ((double)t_end - t_start) / 1000.0);
  check_result(stacks, n);
  free_stacks(&stacks, n);

  return 0;
}
/* Sequential reference: keep asking for a target stack and push the value
   returned by process() on it, until get_random_stack() returns -1. */
void stacks_seq(stack_t *stacks, int n)
{
  int target;

  while ((target = get_random_stack()) != -1)
  {
    stack_t *st = &stacks[target];

    /* push on top of the selected stack */
    st->elems[st->cnt++] = process();
  }
}
/* Parallel version protected by a critical section: every thread loops
   until no target stack is left; only the push itself is serialized. */
void stacks_par_critical(stack_t *stacks, int n)
{
#pragma omp parallel
  {
    for (;;)
    {
      int target = get_random_stack();
      if (target == -1)
        break;

      /* process() is called outside of the critical section: it does not
         touch the stacks, so it can safely run concurrently */
      int value = process();

#pragma omp critical
      {
        stacks[target].elems[stacks[target].cnt++] = value;
      }
    }
  }
}
/* Parallel version based on an atomic capture: a thread first reserves a
   slot of the target stack atomically, then fills it without any further
   synchronization. */
void stacks_par_atomic(stack_t *stacks, int n)
{
#pragma omp parallel
  {
    for (;;)
    {
      int slot;
      int target = get_random_stack();
      if (target == -1)
        break;

      /* reserve the next free slot of the stack */
#pragma omp atomic capture
      slot = stacks[target].cnt++;

      /* the slot is owned by this thread: fill it */
      stacks[target].elems[slot] = process();
    }
  }
}
/*
 * Parallel version based on locks: one OpenMP lock per stack, so that
 * pushes on different stacks never wait for each other.
 *
 * stacks : the n stacks to fill
 * n      : number of stacks (one lock is created per stack)
 *
 * The program is terminated with an error message when the lock array
 * cannot be allocated.
 */
void stacks_par_locks(stack_t *stacks, int n)
{
  int s, tmp;
  /* one lock per stack: the array is indexed with the stack number */
  omp_lock_t *lock = malloc(n * sizeof(omp_lock_t));

  if (lock == NULL)
  {
    fprintf(stderr, "stacks_par_locks: out of memory\n");
    exit(EXIT_FAILURE);
  }

  for (int i = 0; i < n; i++)
  {
    omp_init_lock(lock + i);
  }

#pragma omp parallel private(s, tmp)
  for (;;)
  {
    /* Get the stack number s */
    s = get_random_stack();
    if (s == -1)
      break;

    /* the value is computed before taking the lock */
    tmp = process();

    /* Push some value on stack s */
    omp_set_lock(lock + s);
    stacks[s].elems[stacks[s].cnt++] = tmp;
    omp_unset_lock(lock + s);
  }

  for (int i = 0; i < n; i++)
  {
    omp_destroy_lock(lock + i);
  }
  free(lock);
}

Binary file not shown.

BIN
BE_OpenMP_2019/subject.pdf Normal file

Binary file not shown.

View file

@ -0,0 +1,31 @@
# Builds the `main` executable of the tree exercise with gcc and OpenMP.
CC = gcc
LINK = $(CC)
# Debug build with warnings on unknown pragmas.
CFLAGS = -g -fopenmp -Wunknown-pragmas
# CFLAGS = -g -fopenmp
# The same flags are used at link time (keeps -fopenmp on the link line).
LDFLAGS = $(CFLAGS)
# Math library.
LIBS = -lm
OBJS = _aux.o longest_branch_seq.o longest_branch_par.o main.o
# Link step.
main: $(OBJS)
$(LINK) -o $@ $(LDFLAGS) $(OBJS) $(LIBS)
# Remove the objects and the executable.
clean:
(rm -f *.o main)
# Pattern rule for C sources.
%.o: %.c
$(CC) $(CFLAGS) $(CINCLUDES) $(INCLUDES) $(CDEFS) $(PREC) -c $<
# Pattern rule for Fortran 90 sources (FC/FCFLAGS are not set in this file).
%.o: %.f90
$(FC) $(FCFLAGS) $(FINCLUDES) $(INCLUDES) $(FDEFS) $(DEFINE_PREPEND)$(PREC) -c $<

View file

@ -0,0 +1,119 @@
#include "_aux.h"
#include <stdio.h>
#include <omp.h>
/* Microseconds of wall-clock time, as reported by gettimeofday(). */
long usecs (){
  struct timeval clk;
  long result;

  gettimeofday(&clk,NULL);
  result = clk.tv_sec*1000000+clk.tv_usec;
  return result;
}
/* Active wait of sec seconds: the clock is polled until the requested
   time has elapsed since entry. */
void mysleep(double sec){
  long t0 = usecs();
  long t1 = 0;

  for(;;){
    if(!(((double) t1-t0)/1000000 < sec))
      break;
    t1 = usecs();
  }
  return;
}
/*
 * Recursively build a random subtree rooted at root.
 *
 * root : node to initialize (its storage is provided by the caller)
 * l    : level budget; the node gets rand()%l children, each built with
 *        budget l-1.  With l <= 0 the node is made a leaf.
 * id   : running node counter; every node gets the next value
 *
 * Every node receives a random weight in [1, 50] and a zero branch
 * weight.  The program is terminated with an error message when the
 * children array cannot be allocated.
 */
void treeinitrec(node_t *root, int l, int *id){
  unsigned int i;

  root->id = ++(*id);
  /* rand()%l is undefined for l == 0: such a node simply has no children */
  root->nc = (l > 0) ? rand()%l : 0;

  root->children = NULL;
  if(root->nc > 0){
    root->children = (node_t*)malloc(root->nc*sizeof(node_t));
    if(root->children == NULL){
      fprintf(stderr, "treeinitrec: out of memory\n");
      exit(EXIT_FAILURE);
    }
  }

  root->weight = rand()%50+1;
  root->branch_weight = 0;

  for(i=0; i<root->nc; i++)
    treeinitrec(root->children+i, l-1, id);

  return ;
}
/* Build a random tree with level budget l under root (node numbering
   starts at 1) and dump it to "tree.dot". */
void treeinit(node_t *root, int l){
  int next_id = 0;

  treeinitrec(root, l, &next_id);
  treeprint(root, "tree.dot");
}
/* Write the subtree rooted at root to pfile: the node itself, then its
   edges to all of its children, then the children's subtrees. */
void treeprintrec(node_t *root, FILE *pfile){
  unsigned int c;

  /* node declaration, labelled with id and weight */
  fprintf(pfile, "node%4.4d[label=\"id:%d\\lwg:%d\\l\"];\n",root->id,root->id, root->weight);

  /* edges first ... */
  c = 0;
  while(c < root->nc){
    fprintf(pfile, "node%4.4d -- node%4.4d\n",root->id,root->children[c].id);
    c++;
  }

  /* ... then the subtrees */
  c = 0;
  while(c < root->nc){
    treeprintrec(root->children+c, pfile);
    c++;
  }
  return;
}
/*
 * Write the tree rooted at root to the file fname as an undirected
 * graph ("graph G { ... }") with one node statement per tree node and
 * one edge per parent/child pair.
 *
 * If the file cannot be opened, an error is reported with perror() and
 * nothing is written.
 */
void treeprint(node_t *root, char *fname){
  FILE *pfile;

  pfile = fopen (fname,"w");
  if(pfile == NULL){
    perror(fname);
    return;
  }

  fprintf(pfile, "graph G {\n");
  fprintf(pfile, "node [color=black,\n");
  fprintf(pfile, "fillcolor=white,\n");
  fprintf(pfile, "shape=circle,\n");
  fprintf(pfile, "fontname=Courier,\n");
  fprintf(pfile, "style=filled\n");
  fprintf(pfile, "];\n");

  treeprintrec(root, pfile);

  fprintf(pfile, "}");
  fclose(pfile);
}
/* Simulate the processing of a node: busy-wait for weight/10000 seconds.
   Always returns 0. */
int process(node_t *node){
  double seconds = ((double)node->weight)/10000.0;

  mysleep(seconds);
  return 0;
}

View file

@ -0,0 +1,22 @@
/* Types and prototypes shared by the tree traversal exercise. */
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
/* A tree node.
   weight        : cost of the node (process() waits weight/10000 seconds)
   branch_weight : sum of the weights from the root down to this node,
                   computed during the traversals
   id            : node number
   nc            : number of children
   children      : array of the nc child nodes */
typedef struct nodestruct{
unsigned int weight, branch_weight;
unsigned int id, nc;
struct nodestruct *children;
} node_t;
/* Wall-clock time in microseconds. */
long usecs ();
/* Build a random tree with level budget l under root. */
void treeinit(node_t *root, int l);
/* Simulated processing of a node. */
int process(node_t *node);
/* Dump the tree to the file fname. */
void treeprint(node_t *root, char *fname);
/* Sequential search of the heaviest root-to-leaf branch: *weight receives
   its weight and *leaf the id of its leaf. */
void longest_branch_seq(node_t *root, unsigned int *weight, unsigned int *leaf);
void longest_branch_seq_rec(node_t *root, unsigned int *weight, unsigned int *leaf);
/* Parallel counterpart; updates has one counter per thread. */
void longest_branch_par(node_t *root, unsigned int *weight, unsigned int *leaf);
void longest_branch_par_rec(node_t *root, unsigned int *weight, unsigned int *leaf, int * updates);

View file

@ -0,0 +1,53 @@
#include "_aux.h"
#include "omp.h"
/* Parallel search of the heaviest root-to-leaf branch.  A single thread
   starts the recursive traversal, which spawns tasks for the team.  After
   the traversal the per-thread update counters are printed on one line. */
void longest_branch_par(node_t *root, unsigned int *longest_branch_weight, unsigned int *longest_branch_leaf){
  int slots = omp_get_max_threads();
  int *updates = (int*) calloc(slots,sizeof(int));

  /* initial state: no branch found yet */
  root->branch_weight = 0;
  *longest_branch_leaf = -1;
  *longest_branch_weight = 0;

#pragma omp parallel
  {
#pragma omp single
    {
      longest_branch_par_rec(root, longest_branch_weight, longest_branch_leaf, updates);
    }
  }

  /* one counter per thread */
  for (int t = 0; t < slots; t++) {
    printf("%d ",updates[t]);
  }
  printf("\n");

  free(updates);
}
/*
 * Recursive step of the parallel traversal.
 *
 * The node is processed and its branch weight completed with its own
 * weight.  For an inner node, one task is created per child; the task
 * sets the child's branch weight, counts one update for the executing
 * thread and recurses.  For a leaf, the shared maximum is compared and
 * updated inside a named critical section, because several tasks may
 * reach leaves at the same time.
 *
 * updates : per-thread counters, indexed by omp_get_thread_num()
 */
void longest_branch_par_rec(node_t *root, unsigned int *longest_branch_weight, unsigned int *longest_branch_leaf, int * updates){
  unsigned int i;

  process(root);
  root->branch_weight += root->weight;

  if(root->nc>0) {
    for(i=0; i<root->nc; i++){
      /* firstprivate: the task must use the value of i at creation time */
#pragma omp task firstprivate(i)
      {
        root->children[i].branch_weight = root->branch_weight;
        updates[omp_get_thread_num()] += 1;
        longest_branch_par_rec(root->children+i, longest_branch_weight, longest_branch_leaf,updates);
      }
    }
  } else {
    /* the test and the two stores must be done as a whole: without the
       critical section two leaves could interleave and leave a weight
       and a leaf id that do not belong together */
#pragma omp critical(longest_branch_update)
    {
      if(root->branch_weight > *longest_branch_weight){
        *longest_branch_weight = root->branch_weight;
        *longest_branch_leaf = root->id;
      }
    }
  }
}

View file

@ -0,0 +1,37 @@
#include "_aux.h"
/* Sequential search of the heaviest root-to-leaf branch: reset the
   results and the root's branch weight, then run the recursive traversal. */
void longest_branch_seq(node_t *root, unsigned int *longest_branch_weight, unsigned int *longest_branch_leaf){
  /* initial state: no branch found yet */
  root->branch_weight = 0;
  *longest_branch_leaf = -1;
  *longest_branch_weight = 0;

  longest_branch_seq_rec(root, longest_branch_weight, longest_branch_leaf);
}
/* Recursive step of the sequential traversal: process the node, complete
   its branch weight, then either record a new maximum (leaf) or hand the
   branch weight down to each child and recurse (inner node). */
void longest_branch_seq_rec(node_t *root, unsigned int *longest_branch_weight, unsigned int *longest_branch_leaf){
  process(root);
  root->branch_weight += root->weight;

  if(root->nc == 0){
    /* leaf: keep it if its branch is strictly heavier than the best one */
    if(*longest_branch_weight < root->branch_weight){
      *longest_branch_weight = root->branch_weight;
      *longest_branch_leaf = root->id;
    }
    return;
  }

  for(unsigned int c=0; c<root->nc; c++){
    node_t *child = root->children+c;

    child->branch_weight = root->branch_weight;
    longest_branch_seq_rec(child, longest_branch_weight, longest_branch_leaf);
  }
}

View file

@ -0,0 +1,63 @@
#include "_aux.h"
void longest_branch_seq(node_t *root, unsigned int *longest_branch_weight, unsigned int *longest_branch_leaf);
void longest_branch_seq_rec(node_t *root, unsigned int *longest_branch_weight, unsigned int *longest_branch_leaf);
void longest_branch_par(node_t *root, unsigned int *longest_branch_weight, unsigned int *longest_branch_leaf);
void longest_branch_par_rec(node_t *root, unsigned int *longest_branch_weight, unsigned int *longest_branch_leaf, int * updates);
/*
 * Driver: builds a random tree from the level count l and the seed s,
 * then runs the sequential and the parallel search of the heaviest
 * branch, printing time, weight and leaf for both.
 *
 * Usage: ./main l s
 * Returns 0 on success, 1 on bad arguments.
 */
int main(int argc, char **argv) {
  int l, s;
  long ts, te;
  node_t root;
  unsigned int longest_branch_weight, longest_branch_leaf;

  if ( argc == 3 ) {
    l = atoi(argv[1]); /* the number of levels in the tree */
    s = atoi(argv[2]); /* the seed for the random number generation */
  } else {
    printf("Usage:\n\n ./main l s\n\nwhere l is the number of levels in the tree and s the seed for the random number generation.\n");
    return 1;
  }

  /* the number of children of a node is drawn modulo the level count */
  if ( l < 1 ) {
    printf("The number of levels must be a positive integer.\n");
    return 1;
  }

  /* Change this to generate different trees */
  srand(s);
  treeinit(&root, l);

  printf("==================================================\n\n");
  printf("Starting sequential execution\n");
  ts = usecs();
  longest_branch_seq(&root, &longest_branch_weight, &longest_branch_leaf);
  te = usecs()-ts;
  printf("Sequential execution time: %6ld msec.\n",te/1000);
  /* both results are unsigned int: print them with %u */
  printf("Sequential weight : %u\n",longest_branch_weight);
  printf("Sequential leaf : %u\n",longest_branch_leaf);

  printf("==================================================\n\n");
  printf("Starting parallel execution\n");
  ts = usecs();
  longest_branch_par(&root, &longest_branch_weight, &longest_branch_leaf);
  te = usecs()-ts;
  printf("Parallel execution time : %6ld msec.\n",te/1000);
  printf("Parallel weight : %u\n",longest_branch_weight);
  printf("Parallel leaf : %u\n",longest_branch_leaf);

  return 0;
}

Binary file not shown.

View file

@ -0,0 +1,174 @@
\documentclass{article}
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{graphicx}
\usepackage{marvosym}
\usepackage{dingbat}
\usepackage{tikz}
\title{Longest branch of a tree}
\date{}
\usetikzlibrary{arrows}
\tikzset{
treenode/.style = {align=center, inner sep=0pt, text centered, font=\sffamily},
wn/.style = {treenode, circle, black, font=\ttfamily, draw=white, fill=white, text width=1.5em},
txt/.style = {black, anchor=west, font=\ttfamily, draw=white, fill=white},
edge from parent/.style={thick,draw=black, latex-}
}
\begin{document}
\maketitle
\section{Longest tree branch}
This exercise is about parallelizing a Depth First Search (DFS) traversal of
a random tree. We assume that each node has a weight which corresponds
to the time it takes to process it. Our DFS traversal finds the
longest branch in the tree, i.e., the branch such that the sum of the
weights of its nodes is maximum.
The tree is made of nodes of type \texttt{node\_t}, which contain the following members:
\begin{itemize}
\item \texttt{weight}: the weight of the node;
\item \texttt{branch\_weight}: the weight of the branch that connects
the node to the root of the tree; this is set to zero at the
beginning and is updated during the DFS traversal;
\item \texttt{id}: the node number;
\item \texttt{nc}: the number of children of the node.
\item \texttt{*children}: a pointer to the array holding the children of the node.
\end{itemize}
The traversal is done recursively using the following code
\begin{verbatim}
void longest_branch_seq_rec(node_t *root, unsigned int *weight, unsigned int *leaf){
int i;
process(root);
root->branch_weight += root->weight;
if(root->nc>0) {
for(i=0; i<root->nc; i++){
root->children[i].branch_weight = root->branch_weight;
longest_branch_seq_rec(root->children+i, weight, leaf);
}
} else {
if(root->branch_weight > *weight){
*weight = root->branch_weight;
*leaf = root->id;
}
}
}
\end{verbatim}
The \texttt{weight} and \texttt{leaf} arguments of this function are
meant to return the weight of the longest branch and the corresponding
leaf. When we visit a node, first we process it using the
\texttt{process} routine and we update the weight of the branch that
connects it to the root (i.e., we add its weight to the branch weight
of its father). Then, if it has children, we recursively call this
code on each one of them, if not then it means that we have reached a
leaf of the tree; in this case if the weight of the current branch is
greater than the current maximum contained in the \texttt{weight}
variable, we update \texttt{weight} and \texttt{leaf}.
For example, on the tree below, this method would return the leaf
number $3$ and the associated weight of $137$.
\begin{center}
\includegraphics[width=0.4\textwidth]{tree.pdf}
\end{center}
\section{Package content}
In the \texttt{tree\_branch} directory you will find the
following files:
\begin{itemize}
\item \texttt{main.c}: this file contains the main program which first
initializes the tree for a provided number of maximum levels. The main
program then calls a sequential routine \texttt{longest\_branch\_seq}
containing the above code, then calls the
\texttt{longest\_branch\_par} routine which is supposed to contain a
parallel version of the traversal code.
\item \texttt{longest\_branch\_seq.c}: contains a routine implementing a
sequential traversal with the code presented above.
\item \texttt{longest\_branch\_par.c} contains a routine implementing a
parallel tree traversal. \textbf{Only this file has to be modified
for this exercise}.
\item \texttt{aux.c, aux.h}: these two files contain auxiliary
routines and \textbf{must not be modified}.
\end{itemize}
The code can be compiled with the \texttt{make} command: just type
\texttt{make} inside the \texttt{tree\_branch} directory; this
will generate a \texttt{main} program that can be run like this:
\begin{verbatim}
$ ./main l s
\end{verbatim}
where \texttt{l} is the number of levels in the tree. The argument $s$
is the seed for the random number generation (which is used to build
the tree), and can be used to create trees of different shapes for a
fixed number of levels.
\section{Assignment}
\begin{itemize}
\item {\huge \Keyboard} At the beginning, the
\texttt{longest\_branch\_par} routine contains an exact copy of the
\texttt{longest\_branch\_seq} one. Modify this routine in order to
parallelize it. Make sure that the result computed by the three
routines (sequential and parallel ones) is consistently (that is, at
every execution of the parallel code) the same; a message printed at
the end of the execution will tell you whether this is the
case. Note that there may be multiple branches of the same length;
in this case any of them will be considered a correct result. Also,
modify the code in order to count the number of nodes updated by
each of the working threads.
\item \smallpencil Report the execution times for the implemented
parallel version with 1, 2 and 4 threads and for different tree
sizes. Analyze and comment on your results: is the achieved speedup
reasonable or not? Report your answer in the \texttt{responses.txt}
file.
\end{itemize}
\paragraph{Advice}
\begin{itemize}
\item As usual, when developing and debugging choose trees of small
size. When evaluating performance it's better to choose a larger
tree size.
\end{itemize}
\end{document}
%%% Local Variables:
%%% mode: latex
%%% TeX-master: t
%%% End:

File diff suppressed because it is too large Load diff

3
BE_OpenMP_2021/.vscode/settings.json vendored Normal file
View file

@ -0,0 +1,3 @@
{
"editor.formatOnSave": false
}

13
BE_OpenMP_2021/Makefile Normal file
View file

@ -0,0 +1,13 @@
# Top-level convenience target: run `make clean` in every exercise directory.
clean:
(cd butterfly; make clean)
(cd neural_network; make clean)
(cd norm2; make clean)
(cd pipelining; make clean)
(cd tree_bottomup; make clean)

View file

@ -0,0 +1,31 @@
# Builds the `main` executable from aux.o and main.o with gcc and OpenMP.
CC = gcc
LINK = $(CC)
# Optimized build with warnings on unknown pragmas; debug alternative below.
CFLAGS = -O3 -fopenmp -Wunknown-pragmas
# CFLAGS = -g -fopenmp
# The same flags are used at link time (keeps -fopenmp on the link line).
LDFLAGS = $(CFLAGS)
# Math library.
LIBS = -lm
OBJS = aux.o main.o
# Link step.
main: $(OBJS)
$(LINK) -o $@ $(LDFLAGS) $(OBJS) $(LIBS)
# Remove the objects and the executable.
clean:
(rm -f *.o main)
# Pattern rule for C sources.
%.o: %.c
$(CC) $(CFLAGS) $(CINCLUDES) $(INCLUDES) $(CDEFS) $(PREC) -c $<
# Pattern rule for Fortran 90 sources (FC/FCFLAGS are not set in this file).
%.o: %.f90
$(FC) $(FCFLAGS) $(FINCLUDES) $(INCLUDES) $(FDEFS) $(DEFINE_PREPEND)$(PREC) -c $<

View file

@ -0,0 +1,72 @@
#include "aux.h"
/* Allocate an array of n random digits (values 0..9, drawn with rand())
   and return it through *array; *res receives the sum of its entries.
   The caller owns the array. */
void generate_array(int n, int **array, int *res){
  int *data = (int*)malloc(n*sizeof(int));
  int total = 0;

  for(int k=0; k<n; k++){
    int digit = rand()%10;

    data[k] = digit;
    total += digit;
  }

  *array = data;
  *res = total;
}
/* The reduction operator: returns a+b after a 0.0001 s busy-wait that
   simulates an expensive operation. */
int operator(int a, int b){
  int sum = a+b;

  mysleep(0.0001);
  return sum;
}
/* Wall-clock timestamp in microseconds (seconds*1000000 + microseconds). */
long usecs (){
  struct timeval stamp;
  long us;

  gettimeofday(&stamp,NULL);
  us = stamp.tv_sec*1000000+stamp.tv_usec;
  return us;
}
/* Spin on the clock until sec seconds have passed since the call; the
   thread is kept busy for the whole duration. */
void mysleep(double sec){
  long started = usecs();
  long current = 0;

  for(;;){
    if(!(((double) current-started)/1000000 < sec))
      break;
    current = usecs();
  }
  return;
}
/* Print whether every one of the n entries of array equals res. */
void check_result(int n, int *array, int res){
  int mismatch = 0;

  /* stop at the first entry that differs from the expected value */
  for(int k=0; k<n && !mismatch; k++){
    if(array[k] != res)
      mismatch = 1;
  }

  if(!mismatch){
    printf("The result is CORRECT\n");
  } else {
    printf("The result is WRONG!!!\n");
  }
}

View file

@ -0,0 +1,17 @@
/* Helpers shared by the butterfly reduction exercise. */
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <omp.h>
#include <math.h>
#include <sys/time.h>
/* Smaller of two values; each argument may be evaluated twice. */
#define MIN(a,b) (((a)<(b))?(a):(b))
/* The reduction operator: a+b, after a short simulated delay. */
int operator(int a, int b);
/* Allocate n random values in *array; *res receives their sum. */
void generate_array(int n, int **array, int *res);
/* Busy-wait for t seconds. */
void mysleep(double t);
/* Wall-clock time in microseconds. */
long usecs ();
/* Print whether all n entries of array equal res. */
void check_result(int n, int *array, int res);

View file

@ -0,0 +1,146 @@
#include "aux.h"
void butterfly_seq(int n, int l, int *array);
void butterfly_par(int n, int l, int *array);
/*
 * Driver: generates an array of n = 2^l random values, runs the
 * sequential and the parallel butterfly reduction on two copies of it,
 * prints the timings (and the arrays when n <= 32) and checks the
 * parallel result against the expected sum.
 *
 * Usage: ./main l
 * Returns 0 on success, 1 on bad arguments or allocation failure.
 */
int main(int argc, char **argv)
{
  long t_start, t_end;
  int i, n, l, res;
  int *array_seq, *array_par;

  if (argc == 2)
  {
    l = atoi(argv[1]);
  }
  else
  {
    printf("Usage:\n\n ./main l\n\nwhere l defines the size of the array n=2^l.\n");
    return 1;
  }

  /* n = 2^l has to fit in an int */
  if (l < 0 || l > 30)
  {
    printf("l must be an integer between 0 and 30.\n");
    return 1;
  }

  /* exact integer power of two */
  n = 1 << l;

  printf("\nGenerating an array with %d elements\n", n);
  generate_array(n, &array_seq, &res);

  array_par = (int *)malloc(n * sizeof(int));
  if (array_seq == NULL || array_par == NULL)
  {
    printf("Out of memory.\n");
    free(array_seq);
    free(array_par);
    return 1;
  }
  for (i = 0; i < n; i++)
    array_par[i] = array_seq[i];

  if (n <= 32)
  {
    printf("array_seq=[");
    for (i = 0; i < n; i++)
    {
      printf("%d ", array_seq[i]);
    }
    printf("]\n");
  }

  printf("The expected result is : %d\n\n\n\n", res);

  t_start = usecs();
  butterfly_seq(n, l, array_seq);
  t_end = usecs();
  printf("Sequential time : %8.2f msec.\n", ((double)t_end - t_start) / 1000.0);

  if (n <= 32)
  {
    printf("The result of the sequential reduction is\n");
    printf("array_seq=[");
    for (i = 0; i < n; i++)
      printf("%d ", array_seq[i]);
    printf("]\n");
  }

  printf("\n\n\n");

  t_start = usecs();
  butterfly_par(n, l, array_par);
  t_end = usecs();
  printf("Parallel time : %8.2f msec.\n", ((double)t_end - t_start) / 1000.0);

  if (n <= 32)
  {
    printf("The result of the parallel reduction is\n");
    printf("array_par=[");
    for (i = 0; i < n; i++)
      printf("%d ", array_par[i]);
    printf("]\n\n");
  }

  check_result(n, array_par, res);

  free(array_par);
  free(array_seq);
  return 0;
}
/*
 * Sequential butterfly reduction of the n = 2^l entries of array.
 *
 * Stage p (p = 0 .. l-1) combines, with operator(), every pair of entries
 * that are s = 2^p positions apart inside blocks of 2*s entries, and
 * stores the result in both entries of the pair.
 */
void butterfly_seq(int n, int l, int *array)
{
  int p, i, j, s;

  for (p = 0; p < l; p++)
  {
    /* integer power of two: a floating-point pow() result could be
       truncated to the wrong stride when converted to int */
    s = 1 << p;

    for (i = 0; i < n; i += 2 * s)
    {
      for (j = 0; j < s; j++)
      {
        int r = operator(array[i + j], array[i + j + s]);
        array[i + j] = r;
        array[i + j + s] = r;
      }
    }
  }
}
// void butterfly_par(int n, int l, int *array){
// int p, i, j, s;
// p = 0;
// while(p<l){
// s = pow(2,p);
// for(i=0; i<n; i+=2*s){
// #pragma omp parallel for
// for(j=0; j<s; j++){
// int r = operator(array[i+j],array[i+j+s]);
// array[i+j] = r;
// array[i+j+s] = r;
// }
// }
// p+=1;
// }
// }
/*
 * Task-parallel butterfly reduction of the n = 2^l entries of array.
 *
 * A single thread walks the same loop nest as the sequential version and
 * creates one task per pair of entries.  Each task declares both entries
 * of its pair as dependences, so a task of a later stage starts only
 * after the tasks that last wrote its two entries have completed; no
 * barrier between stages is needed.
 */
void butterfly_par(int n, int l, int *array)
{
  int p, i, j, s;

#pragma omp parallel
#pragma omp single
  for (p = 0; p < l; p++)
  {
    /* integer power of two: a floating-point pow() result could be
       truncated to the wrong stride when converted to int */
    s = 1 << p;

    for (i = 0; i < n; i += 2 * s)
    {
      for (j = 0; j < s; j++)
      {
        /* firstprivate: the task uses the loop values at creation time */
#pragma omp task firstprivate(i, j, s) depend(out : array[i + j], array[i + j + s])
        {
          int r = operator(array[i + j], array[i + j + s]);
          array[i + j] = r;
          array[i + j + s] = r;
        }
      }
    }
  }
}

Binary file not shown.

View file

@ -0,0 +1,8 @@
#!/bin/bash
# Environment for building against the toolchain installed under
# /mnt/n7fs/ens/tp_abuttari.
# NOTE(review): the exports only reach the caller's shell if this file is
# sourced rather than executed - confirm the intended usage.

# Link line for the ACML static library plus the Fortran and math runtimes.
export LBLAS="/mnt/n7fs/ens/tp_abuttari/acml-4.4.0/gfortran64/lib/libacml.a -lgfortran -lm"
# Put the gcc 9.2.0 binaries first on PATH.
export PATH=/mnt/n7fs/ens/tp_abuttari/opt/gcc-9.2.0/bin/:$PATH;
# Prepend the gmp, mpfr, mpc and gcc library directories to the loader path.
export LD_LIBRARY_PATH=/mnt/n7fs/ens/tp_abuttari/opt/gmp-6.2.1/lib/:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/mnt/n7fs/ens/tp_abuttari/opt/mpfr-4.1.0/lib/:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/mnt/n7fs/ens/tp_abuttari/opt/mpc-1.2.0/lib/:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/mnt/n7fs/ens/tp_abuttari/opt/gcc-9.2.0/lib64/:$LD_LIBRARY_PATH

View file

@ -0,0 +1,28 @@
# Builds the `main` executable from aux.c and main.c with gcc and OpenMP,
# linking against the ACML static library.
CC = gcc
LINK = $(CC)
CFLAGS = -O3 -fopenmp -Wunknown-pragmas
# CFLAGS = -g -fopenmp
# The compile flags are reused at link time so that -fopenmp also reaches the linker.
LDFLAGS = $(CFLAGS)
# Hard-coded path to the ACML library that supplies the dgemm_/dlarnv_/dcopy_ symbols.
# NOTE(review): this assignment overrides any LBLAS value coming from the environment - confirm that is intended.
LBLAS = /mnt/n7fs/ens/tp_abuttari/acml-4.4.0/gfortran64/lib/libacml.a -lm
LIBS = -lm $(LBLAS) -lm -lgfortran
OBJS = aux.o main.o
# Link step.
main: $(OBJS)
$(LINK) -o $@ $(LDFLAGS) $(OBJS) $(LIBS)
# Remove the objects and the executable.
clean:
(rm -f *.o main)
# Pattern rule for C sources.
%.o: %.c
$(CC) $(CFLAGS) $(CINCLUDES) $(INCLUDES) $(CDEFS) $(PREC) -c $<
# Pattern rule for Fortran sources; FC and FCFLAGS are not set in this file.
%.o: %.f90
$(FC) $(FCFLAGS) $(FINCLUDES) $(INCLUDES) $(FDEFS) $(DEFINE_PREPEND)$(PREC) -c $<

View file

@ -0,0 +1,117 @@
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <string.h>
#include <math.h>
#include "aux.h"
/* File-scope constants handed by address to the Fortran-style routines
 * declared in aux.h (every argument of those routines is a pointer). */
/* Seed array passed to dlarnv_; the same array is reused by every call. */
int ISEED[4] = {0,0,0,1};
/* Used as the `idist` argument of dlarnv_ and as the stride arguments of dcopy_. */
int IONE=1;
/* Passed as both TRANSA and TRANSB to dgemm_. */
char NoTran = 'N';
/* Not referenced by the functions visible in this file. */
double DONE=1.0, DMONE=-1.0;
/* ALPHA and BETA arguments of the dgemm_ call in block_mult. */
double alpha=1.0, beta=0.0;
/*
 * Allocate and fill the network and three identical copies of the data.
 *
 * L   : receives an array of l layers; each layer holds an n x n grid of
 *       weight blocks W (m*m doubles each) and n bias blocks b (m doubles each)
 * Ds, Dpl, Dpt : each receives an array of l+1 data sets; each data set holds
 *       n blocks X of m*m doubles.  Dpl and Dpt are copies of Ds.
 * n   : number of blocks per dimension
 * m   : block size
 * l   : number of layers
 *
 * Values are produced by dlarnv_ (presumably LAPACK's random vector generator
 * - confirm against the linked library).  Every call advances the shared
 * ISEED, so the generated values depend on the call order below.
 * No allocation result is checked.
 */
void init_data(layer **L, data **Ds, data **Dpl, data **Dpt, int n, int m, int l){
int i, j, lay, mm;
*L = (layer*) malloc(l*sizeof(layer));
*Ds = (data*) malloc((l+1)*sizeof(data));
*Dpl = (data*) malloc((l+1)*sizeof(data));
*Dpt = (data*) malloc((l+1)*sizeof(data));
/* Number of coefficients in one m x m block. */
mm = m*m;
for(lay=0; lay<l; lay++){
(*L)[lay].W = (block**)malloc(n*sizeof(block*));
(*L)[lay].b = (block* )malloc(n*sizeof(block ));
(*Ds)[lay].X = (block* )malloc(n*sizeof(block ));
(*Dpl)[lay].X = (block* )malloc(n*sizeof(block ));
(*Dpt)[lay].X = (block* )malloc(n*sizeof(block ));
for(i=0; i<n; i++){
((*L)[lay]).W[i] = (block*)malloc(n*sizeof(block));
((*L)[lay]).b[i].b = (double*)malloc(m *sizeof(double));
((*Ds)[lay]).X[i].b = (double*)malloc(mm*sizeof(double));
((*Dpl)[lay]).X[i].b = (double*)malloc(mm*sizeof(double));
((*Dpt)[lay]).X[i].b = (double*)malloc(mm*sizeof(double));
/* Fill the bias and the reference data block, then duplicate the data block
 * into the two other data sets so all three start from the same values. */
dlarnv_(&IONE, ISEED, &m , ((*L )[lay]).b[i].b);
dlarnv_(&IONE, ISEED, &mm, ((*Ds)[lay]).X[i].b);
dcopy_(&mm, ((*Ds)[lay]).X[i].b, &IONE, ((*Dpl)[lay]).X[i].b, &IONE);
dcopy_(&mm, ((*Ds)[lay]).X[i].b, &IONE, ((*Dpt)[lay]).X[i].b, &IONE);
for(j=0; j<n; j++){
((*L)[lay]).W[i][j].b = (double*)malloc(mm*sizeof(double));
dlarnv_(&IONE, ISEED, &mm, ((*L)[lay]).W[i][j].b);
}
}
}
/* The data arrays have l+1 entries: entry l is set up here, outside the
 * per-layer loop, because there is no layer l to go with it. */
(*Ds)[l].X = (block* )malloc(n*sizeof(block ));
(*Dpl)[l].X = (block* )malloc(n*sizeof(block ));
(*Dpt)[l].X = (block* )malloc(n*sizeof(block ));
for(i=0; i<n; i++){
((*Ds)[l]).X[i].b = (double*)malloc(mm*sizeof(double));
((*Dpl)[l]).X[i].b = (double*)malloc(mm*sizeof(double));
((*Dpt)[l]).X[i].b = (double*)malloc(mm*sizeof(double));
dlarnv_(&IONE, ISEED, &mm, ((*Ds)[l]).X[i].b);
dcopy_(&mm, ((*Ds)[l]).X[i].b, &IONE, ((*Dpl)[l]).X[i].b, &IONE);
dcopy_(&mm, ((*Ds)[l]).X[i].b, &IONE, ((*Dpt)[l]).X[i].b, &IONE);
}
}
/*
 * Multiply two m x m blocks through dgemm_ and store the product in c.
 *
 * a, b : input blocks
 * c    : output block
 * m    : block order, also used as the leading dimension of all three
 *
 * The scaling factors are the file-scope `alpha` and `beta`; both operands
 * are passed with the file-scope `NoTran` flag.
 * NOTE(review): `beta` is initialised to 0.0 in this file, so with the usual
 * GEMM convention the previous content of c is discarded rather than
 * accumulated - confirm this is what the callers expect.
 */
void block_mult(block a, block b, block c, int m){
    double *lhs = a.b;
    double *rhs = b.b;
    double *out = c.b;

    dgemm_(&NoTran, &NoTran,
           &m, &m, &m,
           &alpha,
           lhs, &m,
           rhs, &m,
           &beta,
           out, &m);
}
/*
 * Add a bias to an m x m block and apply tanh to it, in place.
 *
 * b : bias block; its first m entries are read
 * X : block to update; entry (i, j) is stored at X.b[m*j + i]
 * m : block order
 *
 * Entry (i, j) becomes tanh(X(i, j) + b(i)): the same bias entry is used for
 * every column j.
 */
void block_bias_act(block b, block X, int m){
    const double *bias = b.b;

    for (int col = 0; col < m; col++) {
        for (int row = 0; row < m; row++) {
            const int idx = m*col + row;

            X.b[idx] = tanh(X.b[idx] + bias[row]);
        }
    }
}
/*
 * Print the largest relative difference between two arrays of blocks.
 *
 * X1 : reference blocks; their coefficients are the denominators
 * X2 : blocks to compare against the reference
 * n  : number of blocks in each array
 * m  : block order (each block holds m*m coefficients)
 *
 * The reported value is the maximum of |X1 - X2| / |X1| over all coefficients.
 */
void compare_output(block *X1, block *X2, int n, int m){
    const int coeffs = m*m;
    double worst = 0.0;

    for (int blk = 0; blk < n; blk++) {
        const double *ref = X1[blk].b;
        const double *got = X2[blk].b;

        for (int k = 0; k < coeffs; k++) {
            const double rel = fabs(ref[k]-got[k])/fabs(ref[k]);

            if (rel > worst) {
                worst = rel;
            }
        }
    }

    printf("The maximum difference on coefficients is %e\n",worst);
}
/*
 * Current wall-clock time in microseconds, as reported by gettimeofday().
 * The value is only meaningful as a difference between two calls.
 */
long usecs (){
    struct timeval now;

    gettimeofday(&now, NULL);
    return now.tv_usec + now.tv_sec*1000000;
}

View file

@ -0,0 +1,30 @@
/*
 * Types and helpers shared by the neural-network exercise.
 *
 * The include guard makes a second inclusion harmless; without it the
 * anonymous-struct typedefs below would be redefined.
 */
#ifndef NN_AUX_H
#define NN_AUX_H

/* One dense block of coefficients. */
typedef struct {
    double *b;
} block;

/* One layer: a 2-D grid of weight blocks W and a 1-D array of bias blocks b. */
typedef struct {
    block **W, *b;
} layer;

/* The data attached to one layer boundary: an array of blocks X. */
typedef struct {
    block *X;
} data;

/* Multiply blocks a and b of order nb into c (see aux.c for the scaling factors). */
void block_mult(block a, block b, block c, int nb);
/* Add the first block to the second one and apply tanh to it, in place. */
void block_bias_act(block a, block b, int nb);
/* Allocate and fill l layers plus three identical data sets of l+1 entries each. */
void init_data(layer **L, data **Ds, data **Dpl, data **Dpt, int n, int nb, int l);
/* Wall-clock time in microseconds. */
long usecs ();
/* Print the largest relative difference between the n blocks of X1 and X2. */
void compare_output(block *X1, block *X2, int n, int nb);

/* Fortran-convention routines supplied by the linked numerical library:
 * every argument is passed by address. */
void dgemm_ (char *TRANSA, char *TRANSB,
             int *M, int *N, int *K,
             double *ALPHA,
             double *A, int *LDA,
             double *B, int *LDB,
             double *BETA,
             double *C, int *LDC);
void dlarnv_(int *idist, int *iseed, int *n, double *x);
void dcopy_(int *n, double *x, int *ix, double *y, int *iy);

#endif /* NN_AUX_H */

View file

@ -0,0 +1,129 @@
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <string.h>
#include <math.h>
#include "omp.h"
#include "aux.h"
void sequential_nn(layer *Layers, data *D, int n, int m, int L);
void parallel_nn_loops(layer *Layers, data *D, int n, int m, int L);
void parallel_nn_tasks(layer *Layers, data *D, int n, int m, int L);
/*
 * Driver: builds a network, runs the sequential, loop-parallel and
 * task-parallel forward passes on identical copies of the data, times each
 * one, and compares the parallel outputs with the sequential one.
 *
 * Command line: ./main n m L
 * Returns 0 on success, 1 on a usage error.
 */
int main(int argc, char **argv)
{
    int n, m, N, L;
    long t_start, t_end;
    layer *Layers;
    data *Datas_s, *Datas_pl, *Datas_pt;

    // Command line arguments
    if (argc == 4)
    {
        n = atoi(argv[1]); /* size of layer matrix n */
        m = atoi(argv[2]); /* size of batch */
        L = atoi(argv[3]); /* number of layers in the network */
    }
    else
    {
        printf("Usage:\n\n ./main n m L\n\nsuch that nxm is the size of the layers and L is the number of layers and m is the batch size.\n");
        return 1;
    }

    /* m is a divisor below and all three values size allocations, so reject
     * anything that would divide by zero or request a negative size. */
    if (n <= 0 || m <= 0 || L < 0)
    {
        printf("Invalid arguments: n and m must be positive and L must not be negative.\n");
        return 1;
    }

    /* Number of m x m blocks per dimension; a remainder of n / m is dropped. */
    N = (n / m);
    init_data(&Layers, &Datas_s, &Datas_pl, &Datas_pt, N, m, L);

    /* Sequential version */
    t_start = usecs();
    sequential_nn(Layers, Datas_s, N, m, L);
    t_end = usecs();
    printf("Sequential time : %8.2f msec.\n", ((double)t_end - t_start) / 1000.0);

    /* Parallel with loops */
    t_start = usecs();
    parallel_nn_loops(Layers, Datas_pl, N, m, L);
    t_end = usecs();
    printf("Parallel loops time : %8.2f msec. ", ((double)t_end - t_start) / 1000.0);
    /* Compare the two resulting outputs */
    compare_output(Datas_s[L].X, Datas_pl[L].X, N, m);

    /* Parallel with tasks */
    t_start = usecs();
    parallel_nn_tasks(Layers, Datas_pt, N, m, L);
    t_end = usecs();
    printf("Parallel tasks time : %8.2f msec. ", ((double)t_end - t_start) / 1000.0);
    /* Compare the two resulting outputs */
    compare_output(Datas_s[L].X, Datas_pt[L].X, N, m);

    return 0;
}
/*
 * Sequential forward pass through the network.
 *
 * Layers : array of L layers
 * Datas  : array of L+1 data sets; Datas[l] is the input of layer l and
 *          Datas[l + 1] receives its output
 * N      : number of blocks per dimension
 * m      : block order
 * L      : number of layers
 *
 * For every layer, block_mult is called for each (row, column) pair of weight
 * blocks, then block_bias_act is applied to each output block.
 */
void sequential_nn(layer *Layers, data *Datas, int N, int m, int L)
{
    for (int lay = 0; lay < L; lay++) {
        layer *cur = &Layers[lay];
        block *in = Datas[lay].X;
        block *out = Datas[lay + 1].X;

        for (int row = 0; row < N; row++) {
            for (int col = 0; col < N; col++) {
                block_mult(cur->W[row][col], in[col], out[row], m);
            }
        }

        for (int row = 0; row < N; row++) {
            block_bias_act(cur->b[row], out[row], m);
        }
    }
}
/*
 * Forward pass parallelised with OpenMP work-sharing loops.
 *
 * Arguments are the same as for sequential_nn.  One parallel region spans
 * all layers: every thread walks the layer loop, and within a layer the
 * output rows are shared among the threads, first for the multiplications
 * and then for the bias/activation step.  The barrier at the end of each
 * work-sharing loop keeps the two steps and successive layers in order.
 */
void parallel_nn_loops(layer *Layers, data *Datas, int N, int m, int L)
{
    /* Loop counters are declared inside the region, so each thread has its own. */
#pragma omp parallel
    for (int lay = 0; lay < L; lay++) {
#pragma omp for
        for (int row = 0; row < N; row++) {
            for (int col = 0; col < N; col++) {
                block_mult(Layers[lay].W[row][col], Datas[lay].X[col], Datas[lay + 1].X[row], m);
            }
        }

#pragma omp for
        for (int row = 0; row < N; row++) {
            block_bias_act(Layers[lay].b[row], Datas[lay + 1].X[row], m);
        }
    }
}
/*
 * Forward pass parallelised with OpenMP tasks and data dependencies.
 *
 * Arguments are the same as for sequential_nn.  The master thread walks the
 * same loop nest as the sequential version and submits one task per
 * block_mult and per block_bias_act call; the other threads of the team
 * execute them.
 *
 * Dependencies are expressed on the block descriptors:
 *   - a multiplication reads input block j and updates output block i;
 *   - the bias/activation task updates output block i, so it runs after all
 *     multiplications into that block and before any task of the next layer
 *     that reads it.
 *
 * The bias loop is a sibling of the row loop, as in sequential_nn.  It used
 * to be nested inside the row loop while reusing `i`, which ended the row
 * loop after its first iteration, so only row 0 was ever multiplied.
 */
void parallel_nn_tasks(layer *Layers, data *Datas, int N, int m, int L)
{
    int i, j, l;

#pragma omp parallel
#pragma omp master
    for (l = 0; l < L; l++)
    {
        for (i = 0; i < N; i++)
        {
            for (j = 0; j < N; j++)
            {
                /* Loop counters are captured by value at task creation. */
#pragma omp task firstprivate(i, j, l) depend(in: Datas[l].X[j]) depend(inout: Datas[l + 1].X[i])
                block_mult(Layers[l].W[i][j], Datas[l].X[j], Datas[l + 1].X[i], m);
            }
        }

        for (i = 0; i < N; i++)
        {
#pragma omp task firstprivate(i, l) depend(inout: Datas[l + 1].X[i])
            block_bias_act(Layers[l].b[i], Datas[l + 1].X[i], m);
        }
    }
}

Binary file not shown.

View file

@ -0,0 +1,31 @@
# Builds the `main` executable from main.c with gcc and OpenMP.
CC = gcc
LINK = $(CC)
CFLAGS = -O3 -fopenmp
# CFLAGS = -g -fopenmp
# The compile flags are reused at link time so that -fopenmp also reaches the linker.
LDFLAGS = $(CFLAGS)
LIBS = -lm
OBJS = main.o
# Link step.
main: $(OBJS)
$(LINK) -o $@ $(LDFLAGS) $(OBJS) $(LIBS)
# Remove the objects and the executable.
clean:
(rm -f *.o main)
# Pattern rule for C sources.
%.o: %.c
$(CC) $(CFLAGS) $(CINCLUDES) $(INCLUDES) $(CDEFS) $(PREC) -c $<
# Pattern rule for Fortran sources; FC and FCFLAGS are not set in this file.
%.o: %.f90
$(FC) $(FCFLAGS) $(FINCLUDES) $(INCLUDES) $(FDEFS) $(DEFINE_PREPEND)$(PREC) -c $<

139
BE_OpenMP_2021/norm2/main.c Normal file
View file

@ -0,0 +1,139 @@
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <sys/time.h>
#include <omp.h>
/*
 * Wall-clock timestamp in microseconds, built from the seconds and
 * microseconds fields filled in by gettimeofday().
 */
long usecs (){
    struct timeval stamp;

    gettimeofday(&stamp, NULL);

    return stamp.tv_usec + stamp.tv_sec*1000000;
}
double dnorm2_seq(double *x, int n);
double dnorm2_par_red(double *x, int n);
double dnorm2_par_nored(double *x, int n);
/*
 * Driver: fills a vector of n random values in [0, 1], computes its 2-norm
 * with the sequential version and the two parallel versions, prints the time
 * and result of each, and checks the parallel results against the sequential
 * one with a relative tolerance of 1e-10.
 *
 * Command line: ./main n
 * Returns 0 on success, 1 on a usage, argument or allocation error.
 */
int main(int argc, char *argv[]){
    int n, i;
    double *x;
    double n2_seq, n2_par_red, n2_par_nored;
    long t_start,t_end;

    if(argc!=2){
        printf("Wrong number of arguments.\n Usage:\n\n"
               "./main n \n\n where n is the size of the vector x whose 2-norm has to be computed.\n");
        return 1;
    }

    /* A failed conversion would leave n uninitialised, and a non-positive
     * size makes the relative-error checks below divide by zero. */
    if(sscanf(argv[1],"%d",&n) != 1 || n <= 0){
        printf("The vector size n must be a positive integer.\n");
        return 1;
    }

    x = (double*)malloc(sizeof(double)*n);
    if(x == NULL){
        printf("Unable to allocate a vector of %d elements.\n", n);
        return 1;
    }

    for(i=0; i<n; i++)
        x[i] = ((double) rand() / (RAND_MAX));

    printf("\n================== Sequential version ==================\n");
    t_start = usecs();
    n2_seq = dnorm2_seq(x, n);
    t_end = usecs();
    printf("Time (msec.) : %7.1f\n",(t_end-t_start)/1e3);
    printf("Computed norm is: %10.3lf\n",n2_seq);

    printf("\n\n=========== Parallel version with reduction ===========\n");
    t_start = usecs();
    n2_par_red = dnorm2_par_red(x, n);
    t_end = usecs();
    printf("Time (msec.) : %7.1f\n",(t_end-t_start)/1e3);
    printf("Computed norm is: %10.3lf\n",n2_par_red);

    printf("\n========== Parallel version without reduction ==========\n");
    t_start = usecs();
    n2_par_nored = dnorm2_par_nored(x, n);
    t_end = usecs();
    printf("Time (msec.) : %7.1f\n",(t_end-t_start)/1e3);
    printf("Computed norm is: %10.3lf\n",n2_par_nored);

    printf("\n\n");
    if(fabs(n2_seq-n2_par_red)/n2_seq > 1e-10) {
        printf("The parallel version with reduction is numerically wrong! \n");
    } else {
        printf("The parallel version with reduction is numerically okay!\n");
    }
    if(fabs(n2_seq-n2_par_nored)/n2_seq > 1e-10) {
        printf("The parallel version without reduction is numerically wrong!\n");
    } else {
        printf("The parallel version without reduction is numerically okay!\n");
    }

    free(x);
    return 0;
}
double dnorm2_seq(double *x, int n){
int i;
double res;
res = 0.0;
for(i=0; i<n; i++)
res += x[i]*x[i];
return sqrt(res);
}
double dnorm2_par_red(double *x, int n){
int i;
double res;
res = 0.0;
#pragma omp parallel for reduction(+:res)
for(i=0; i<n; i++) {
res += x[i]*x[i];
}
return sqrt(res);
}
double dnorm2_par_nored(double *x, int n){
int i, iam;
double *res;
double sum;
#pragma omp parallel
#pragma omp master
{
res = (double*) malloc(sizeof(double) * omp_get_num_threads());
}
res[omp_get_thread_num()] = 0.0;
#pragma omp for
for(i=0; i<n; i++) {
res[omp_get_thread_num()] += x[i]*x[i];
}
#pragma atomic update
sum += res[omp_get_thread_num()];
#pragma omp master
{
free(res);
}
return sqrt(sum);
}

Binary file not shown.

8
BE_OpenMP_2021/pack.sh Executable file
View file

@ -0,0 +1,8 @@
#!/bin/bash
# Package the exercise sources into $USER.tgz.
#
# The five exercise directories are copied into a staging directory named
# after $USER, everything that is not a .c file, a .h file, a Makefile or a
# directory is removed from the copy, and the result is archived.
#
# The staging steps abort on failure: with an empty $USER or a failed `cd`,
# the clean-up below would otherwise delete files from whatever directory the
# shell happens to be in.

: "${USER:?USER must be set to name the archive}"

make clean;

mkdir "$USER" || exit 1
cp -r butterfly neural_network norm2 pipelining tree_bottomup "$USER" || exit 1

# Prune inside a subshell so the working directory of the script never changes.
(
    cd "$USER" || exit 1
    find . -not \( -name "*.c" -or -name "Makefile" -or -name "*.h" -or -type d \) -exec rm -f -- {} +
) || exit 1

tar zcvf "$USER.tgz" "$USER";
rm -rf "$USER";

View file

@ -0,0 +1,27 @@
# Builds the `main` executable from aux.c and main.c with gcc and OpenMP.
CC = gcc
LINK = $(CC)
CFLAGS = -O3 -fopenmp -Wunknown-pragmas
# CFLAGS = -g -fopenmp
# The compile flags are reused at link time so that -fopenmp also reaches the linker.
LDFLAGS = $(CFLAGS)
LIBS = -lm
OBJS = aux.o main.o
# Link step.
main: $(OBJS)
$(LINK) -o $@ $(LDFLAGS) $(OBJS) $(LIBS)
# Remove the objects and the executable.
clean:
(rm -f *.o main)
# Pattern rule for C sources.
%.o: %.c
$(CC) $(CFLAGS) $(CINCLUDES) $(INCLUDES) $(CDEFS) $(PREC) -c $<
# Pattern rule for Fortran sources; FC and FCFLAGS are not set in this file.
%.o: %.f90
$(FC) $(FCFLAGS) $(FINCLUDES) $(INCLUDES) $(FDEFS) $(DEFINE_PREPEND)$(PREC) -c $<

View file

@ -0,0 +1,105 @@
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <string.h>
#include <math.h>
#include "aux.h"
#include "omp.h"
/*
 * Time of day in microseconds, as returned by gettimeofday().  Intended to
 * be used as the difference between two calls.
 */
long usecs (){
    struct timeval tv;

    gettimeofday(&tv, NULL);
    return tv.tv_usec + tv.tv_sec*1000000;
}
/*
 * Busy-wait for `sec` seconds of wall-clock time.
 *
 * The thread keeps polling usecs() instead of yielding, so it stays busy for
 * the whole duration.  The first comparison is made with a current time of
 * 0, which yields a large negative elapsed time.
 */
void mysleep(double sec){
    const long begin = usecs();
    long now = 0;

    while ((((double) now) - begin)/1000000 < sec) {
        now = usecs();
    }
}
/*
 * Allocate and initialise the data items and the per-step resources.
 *
 * datas     : receives an array of ndatas items; item d starts with
 *             i = -1 (no step done yet) and v = d
 * resources : receives an array of nsteps resources; resource s has i = s,
 *             nr = nsteps and busy = -9 (the "free" marker tested by
 *             process_data)
 * ndatas    : number of data items
 * nsteps    : number of steps
 *
 * Allocation results are not checked.
 */
void init_data(data **datas, resource **resources, int ndatas, int nsteps){
    data *items = (data*) malloc(ndatas*sizeof(data));
    resource *stages = (resource*) malloc(nsteps*sizeof(resource));

    *datas = items;
    *resources = stages;

    for (int d = 0; d < ndatas; d++) {
        items[d].i = -1;
        items[d].v = d;
    }

    for (int s = 0; s < nsteps; s++) {
        stages[s].i = s;
        stages[s].nr = nsteps;
        stages[s].busy = -9;
    }
}
/*
 * Apply step s to data item d using resource r.
 *
 * datas : array of data items
 * d     : index of the item to process
 * s     : step to apply
 * r     : resource to use; must be the one whose index is s
 *
 * The call reports an error and does nothing else when the resource is
 * marked busy, when it is not the resource of step s, or when step s-1 has
 * not been applied to the item yet.  Otherwise it busy-waits for 0.2/nr
 * seconds, records s as the last completed step and updates the item value
 * to 2*v + s.  While it works, the resource is marked with the calling
 * thread number; it is released (-9) on every exit path that acquired it.
 */
void process_data(data *datas, int d, int s, resource *r){
    if (r->busy != -9) {
        printf("Error!!! trying to use resource %d but it is busy.\n",r->i);
        return;
    }

    r->busy = omp_get_thread_num();

    if (r->i != s) {
        printf("Error!!! trying to use resource %d for step %d.\n",r->i,s);
        r->busy = -9;
        return;
    }

    if (datas[d].i != s-1) {
        printf("Error!!! trying step %d on data %d but step %d is not done.\n",s,d,s-1);
        r->busy = -9;
        return;
    }

    /* Simulated work: its duration shrinks as the number of steps grows. */
    const double duration = 0.2/((double)r->nr);
    mysleep(duration);

    datas[d].i = s;
    datas[d].v = datas[d].v*2 + s;

    r->busy = -9;
}
/*
 * Check that every data item went through all the steps, in order.
 *
 * datas  : array of processed items
 * ndatas : number of items
 * nsteps : number of steps each item should have gone through
 *
 * The expected value of item d is recomputed by applying v = 2*v + s for
 * s = 0 .. nsteps-1, starting from v = d.  Prints a failure message at the
 * first mismatch, or a success message when all items match.
 */
void check_result(data *datas, int ndatas, int nsteps){
    for (int d = 0; d < ndatas; d++) {
        long expected = d;

        for (int s = 0; s < nsteps; s++) {
            expected = expected*2+s;
        }

        if (expected != datas[d].v) {
            printf("The result is NOT correct!!!\n");
            return;
        }
    }

    printf("The result is correct!!!\n");
}

View file

@ -0,0 +1,14 @@
/*
 * Types and helpers shared by the pipelining exercise.
 *
 * The include guard makes a second inclusion harmless; without it the
 * anonymous-struct typedefs below would be redefined.
 */
#ifndef PIPELINING_AUX_H
#define PIPELINING_AUX_H

/* A processing resource: its index i, the total number of resources nr, and
 * a busy marker (-9 when free, see init_data and process_data). */
typedef struct {
    int i, nr, busy;
} resource;

/* A data item: the last step applied to it (i, -1 initially) and its value v. */
typedef struct {
    int i;
    long v;
} data;

/* Wall-clock time in microseconds. */
long usecs ();
/* Apply step s to item d with resource r. */
void process_data(data *datas, int d, int s, resource *r);
/* Allocate and initialise ndatas items and nsteps resources. */
void init_data(data **datas, resource **resources, int ndatas, int nsteps);
/* Verify that every item went through all nsteps steps. */
void check_result(data *datas, int ndatas, int nsteps);

#endif /* PIPELINING_AUX_H */

View file

@ -0,0 +1,69 @@
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <string.h>
#include <math.h>
#include "omp.h"
#include "aux.h"
void pipeline(data *datas, resource *resources, int ndatas, int nsteps);
/*
 * Driver: creates the data items and resources, runs the pipeline over
 * them, prints the elapsed time and verifies the result.
 *
 * Command line: ./main ndatas nsteps
 * Returns 0 on success, 1 on a usage error.
 */
int main(int argc, char **argv){
    data *datas;
    resource *resources;

    // Command line arguments
    if ( argc != 3 ) {
        printf("Usage:\n\n ./main ndatas nsteps\n where ndatas is the number of data and nsteps the number of steps.\n");
        return 1;
    }

    const int ndatas = atoi(argv[1]); /* num of datas */
    const int nsteps = atoi(argv[2]); /* num of steps */

    init_data(&datas, &resources, ndatas, nsteps);

    /* Process all the data */
    const long t_start = usecs();
    pipeline(datas, resources, ndatas, nsteps);
    const long t_end = usecs();

    printf("Execution time : %8.2f msec.\n",((double)t_end-t_start)/1000.0);
    check_result(datas, ndatas, nsteps);

    return 0;
}
/*
 * Run every data item through all the steps, in step order.
 *
 * datas     : array of ndatas items
 * resources : array of nsteps resources, one per step
 * ndatas    : number of items
 * nsteps    : number of steps
 *
 * Items are distributed among the threads; each thread takes its items
 * through steps 0 .. nsteps-1.  One lock per step serialises access to the
 * corresponding resource, so different items can be in different steps at
 * the same time.
 *
 * The lock array is now checked after allocation and released before
 * returning; it used to be leaked.
 */
void pipeline(data *datas, resource *resources, int ndatas, int nsteps){
    int d, s;
    omp_lock_t *locks;

    /* Nothing to apply, and nothing to allocate. */
    if (nsteps <= 0) {
        return;
    }

    locks = malloc(sizeof(omp_lock_t)*nsteps);
    if (locks == NULL) {
        printf("Error!!! unable to allocate %d locks.\n", nsteps);
        return;
    }

    for (s = 0; s < nsteps; s++)
    {
        omp_init_lock(locks+s);
    }

    /* Loop over all the data */
#pragma omp parallel for private(s)
    for (d=0; d<ndatas; d++){
        /* Loop over all the steps */
        for (s=0; s<nsteps; s++){
            omp_set_lock(locks+s);
            process_data(datas, d, s, &(resources[s]));
            omp_unset_lock(locks+s);
        }
    }

    for (s = 0; s < nsteps; s++)
    {
        omp_destroy_lock(locks+s);
    }
    free(locks);
}

Binary file not shown.

BIN
BE_OpenMP_2021/subject.pdf Normal file

Binary file not shown.

View file

@ -0,0 +1,31 @@
# Builds the `main` executable from aux.c and main.c with gcc and OpenMP.
CC = gcc
LINK = $(CC)
CFLAGS = -O3 -fopenmp -Wunknown-pragmas
# CFLAGS = -g -fopenmp
# The compile flags are reused at link time so that -fopenmp also reaches the linker.
LDFLAGS = $(CFLAGS)
LIBS = -lm
OBJS = aux.o main.o
# Link step.
main: $(OBJS)
$(LINK) -o $@ $(LDFLAGS) $(OBJS) $(LIBS)
# Remove the objects, the executable and the generated Graphviz .dot files.
clean:
(rm -f *.o main *.dot)
# Pattern rule for C sources.
%.o: %.c
$(CC) $(CFLAGS) $(CINCLUDES) $(INCLUDES) $(CDEFS) $(PREC) -c $<
# Pattern rule for Fortran sources; FC and FCFLAGS are not set in this file.
%.o: %.f90
$(FC) $(FCFLAGS) $(FINCLUDES) $(INCLUDES) $(FDEFS) $(DEFINE_PREPEND)$(PREC) -c $<

View file

@ -0,0 +1,124 @@
#include "aux.h"
/* Bookkeeping shared by tree generation, node processing and the final check. */
/* tot_visited: nodes processed so far (incremented by process_node);
 * to_be_visited: number of nodes requested from generate_tree. */
int tot_visited, to_be_visited;
/* Sum of the data of all generated nodes (accumulated by generate_node). */
unsigned long res;
/* Sum of the data of all processed nodes (accumulated by process_node). */
unsigned long check_res;
/*
 * Build a random tree of `nnodes` nodes and collect its leaves.
 *
 * nnodes  : number of nodes to generate
 * leaves  : receives an array (capacity nnodes) holding the leaf pointers
 * nleaves : receives the number of leaves stored in *leaves
 *
 * Resets the global counters used by process_node and check_result.  For
 * trees of at most 100 nodes the edges are also written to td_tree.dot in
 * Graphviz format.
 *
 * Robustness changes:
 *   - a non-positive nnodes yields an empty tree instead of recursing
 *     without bound in generate_node;
 *   - the leaf array allocation is checked;
 *   - if td_tree.dot cannot be opened a diagnostic is printed and the tree is
 *     generated without the drawing, instead of writing through a NULL stream.
 */
void generate_tree(int nnodes, struct node ***leaves, int *nleaves){
    FILE *fp = NULL;
    int id = nnodes;

    *leaves = NULL;
    *nleaves = 0;
    tot_visited = 0;
    res = 0;
    check_res = 0;
    to_be_visited = nnodes;

    if(nnodes<=0){
        return;
    }

    *leaves = (struct node **)malloc(nnodes*sizeof(struct node *));
    if(*leaves==NULL){
        perror("generate_tree: malloc");
        exit(EXIT_FAILURE);
    }

    if(nnodes<=100){
        fp = fopen("td_tree.dot", "w");
        if(fp!=NULL){
            fprintf(fp, "graph G {\nnode [color=black,\nfillcolor=white,\nshape=circle,\nstyle=filled\n];\n");
        } else {
            perror("td_tree.dot");
        }
    }

    /* generate_node skips the drawing when fp is NULL. */
    generate_node(NULL, nnodes, &id, *leaves, nleaves, fp);

    if(fp!=NULL){
        fprintf(fp, "}\n");
        fclose(fp);
    }
}
/*
 * Recursively create a node and the subtree below it.
 *
 * parent  : parent of the new node, NULL for the root
 * nnodes  : number of nodes the subtree rooted here must contain, this node
 *           included
 * id      : next identifier to hand out; decremented for every node created
 * leaves  : array in which leaf pointers are stored
 * nleaves : number of leaves stored so far; incremented when this node is a leaf
 * fp      : open Graphviz file, or NULL when no drawing is wanted
 *
 * The node data is a random value in [1, 100] which is also added to the
 * global `res`.  The tree shape and the data depend on the order of the
 * rand() calls below.
 * NOTE(review): the code mixes `int nnodes` with `unsigned nc`; it appears to
 * rely on nnodes being at least 1 on entry - confirm against the callers.
 */
void generate_node(struct node *parent, int nnodes, int *id, struct node **leaves, int *nleaves, FILE *fp){
int i;
struct node *curr;
curr = (struct node*) malloc(sizeof(struct node));
curr->parent = parent;
curr->id = *id;
*id = *id - 1;
/* The current node uses up one of the nodes of this subtree. */
nnodes = nnodes -1;
/* Draw a child count, then cap it by the number of nodes left. */
unsigned nc = (rand()%NCHILDREN) +2;
nc = MIN(nc,nnodes);
curr->data = rand()%100 +1;
res += curr->data;
/* No nodes left for children: this node is a leaf. */
if(nc==0) {
leaves[*nleaves] = curr;
*nleaves += 1;
}
if(curr->parent==NULL){
/* printf("Node %5d -- nc %d parent N\n",curr->id, nc); */
} else {
/* printf("Node %5d -- nc %d parent %d\n",curr->id, nc, curr->parent->id); */
if(fp!=NULL)
fprintf(fp, "%d -- %d \n", curr->parent->id, curr->id);
}
/* Split the remaining nodes evenly among the children; the last child also
 * takes the remainder of the division. */
for(i=0; i<nc; i++) {
unsigned nn=nnodes/nc;
if(i==nc-1)
nn = nn + nnodes%nc;
generate_node(curr, nn, id, leaves, nleaves, fp);
}
}
/*
 * Wall-clock time in microseconds, taken from gettimeofday().  Used for
 * timing and by mysleep().
 */
long usecs (){
    struct timeval tod;

    gettimeofday(&tod, NULL);
    return tod.tv_usec + tod.tv_sec*1000000;
}
/*
 * Spin until `sec` seconds of wall-clock time have elapsed.
 *
 * This is an active wait: the calling thread polls usecs() in a loop.  The
 * loop condition is first evaluated with a current time of 0.
 */
void mysleep(double sec){
    const long started = usecs();
    long current = 0;

    while ((((double) current) - started)/1000000 < sec) {
        current = usecs();
    }
}
/*
 * Visit one node of the tree.
 *
 * node : node being visited
 *
 * Counts the visit in the global `tot_visited` and adds the node data to the
 * global `check_res`.  Both updates are atomic so the function can be called
 * from several threads at once.  It then busy-waits for 0.0001 seconds.
 */
void process_node(struct node *node){
#pragma omp atomic update
    tot_visited += 1;

#pragma omp atomic update
    check_res += node->data;

    mysleep(0.0001);
}
void check_result(){
if((tot_visited==to_be_visited) && (check_res==res)){
printf("The result is CORRECT\n");
} else {
printf("The result is WRONG!!!! %d %lu %lu\n",tot_visited,res,check_res);
}
}

View file

@ -0,0 +1,23 @@
/*
 * Types and helpers shared by the bottom-up tree traversal exercise.
 *
 * The include guard makes a second inclusion harmless; without it
 * `struct node` would be defined twice.
 */
#ifndef TREE_AUX_H
#define TREE_AUX_H

#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <omp.h>
#include <sys/time.h>

/* Modulus applied to rand() when drawing the child count in generate_node. */
#define NCHILDREN 2
/* Smaller of two values; note that each argument may be evaluated twice. */
#define MIN(a,b) (((a)<(b))?(a):(b))

/* A tree node: its identifier, its payload, and a link to its parent (NULL
 * for the root).  Nodes hold no links to their children. */
struct node{
    unsigned id;
    unsigned long data;
    struct node *parent;
};

/* Recursively create a node and the subtree below it. */
void generate_node(struct node *parent, int nnodes, int *id, struct node **leaves, int *nleaves, FILE *fp);
/* Visit one node: update the global counters and simulate some work. */
void process_node(struct node *node);
/* Wall-clock time in microseconds. */
long usecs ();
/* Report whether every node was visited exactly as expected. */
void check_result();
/* Build a random tree of nnodes nodes and collect its leaves. */
void generate_tree(int nnodes, struct node ***leaves, int *nleaves);

#endif /* TREE_AUX_H */

View file

@ -0,0 +1,65 @@
#include "aux.h"
void bottom_up(int nleaves, struct node **leaves, int nnodes);
/*
 * Driver: generates a random tree, runs the bottom-up traversal from its
 * leaves, prints the elapsed time and verifies the result.
 *
 * Command line: ./main n
 * Returns 0 on success, 1 on a usage error.
 */
int main(int argc, char **argv){
    int nleaves;
    struct node **leaves;

    // Command line argument: number of nodes in the tree
    if ( argc != 2 ) {
        printf("Usage:\n\n ./main n\n\nwhere n is the number of nodes in the tree.\n");
        return 1;
    }

    const int nnodes = atoi(argv[1]);

    printf("\nGenerating a tree with %d nodes\n\n",nnodes);
    generate_tree(nnodes, &leaves, &nleaves);

    const long t_start = usecs();
    bottom_up(nleaves, leaves, nnodes);
    const long t_end = usecs();

    printf("Parallel time : %8.2f msec.\n\n",((double)t_end-t_start)/1000.0);
    check_result();

    return 0;
}
/* You can change the number and type of arguments if needed. */
/* Just don't forget to update the interface declaration above. */
/*
 * Visit every node of the tree once, climbing from the leaves to the root.
 *
 * nleaves : number of entries in `leaves`
 * leaves  : leaves of the tree
 * nnodes  : total number of nodes; node ids are used as 1-based indices into
 *           the visit table
 *
 * The leaves are distributed among the threads.  Each thread climbs from its
 * leaf towards the root; at every node it atomically fetches and increments
 * that node's visit counter.  The first thread to reach a node processes it
 * and moves on to the parent; any later thread stops climbing there.
 *
 * Fixes over the previous version:
 *   - `paralell` was misspelt, so the pragma was ignored and the loop ran
 *     sequentially;
 *   - `break` lacked its semicolon;
 *   - the visit table was read uninitialised (now zeroed by calloc) and was
 *     never freed.
 */
void bottom_up(int nleaves, struct node **leaves, int nnodes){
    int l;
    int *visited;

    /* Empty tree: nothing to visit, and nothing to allocate. */
    if (nleaves <= 0 || nnodes <= 0) {
        return;
    }

    visited = calloc((size_t)nnodes, sizeof *visited);
    if (visited == NULL) {
        perror("bottom_up: calloc");
        return;
    }

#pragma omp parallel for
    for (l = 0; l < nleaves; l++)
    {
        /* Declared inside the loop body, hence private to each thread. */
        struct node *curr = leaves[l];

        while (curr) {
            int v;

#pragma omp atomic capture
            v = visited[curr->id-1]++;

            if (v>0)
            {
                /* Another thread got here first and handles the ancestors. */
                break;
            }

            process_node(curr);
            curr = curr->parent;
        }
    }

    free(visited);
}

Binary file not shown.

13
BE_OpenMP_2022/Makefile Normal file
View file

@ -0,0 +1,13 @@
# Clean every exercise directory.
#
# $(MAKE) -C fails if a directory is missing.  The previous form
# "(cd dir; make clean)" would instead run "make clean" in this directory
# when cd failed, re-entering this very rule without end.
.PHONY: clean
clean:
	$(MAKE) -C norm1 clean
	$(MAKE) -C server clean
	$(MAKE) -C sched clean
	$(MAKE) -C sparse_nn clean
	$(MAKE) -C ring clean

View file

@ -0,0 +1,31 @@
# Builds the `main` executable from aux.c and main.c with gcc and OpenMP.
CC = gcc
LINK = $(CC)
CFLAGS = -O3 -fopenmp -Wunknown-pragmas
# CFLAGS = -g -fopenmp
# The compile flags are reused at link time so that -fopenmp also reaches the linker.
LDFLAGS = $(CFLAGS)
LIBS = -lm
OBJS = aux.o main.o
# Link step.
main: $(OBJS)
$(LINK) -o $@ $(LDFLAGS) $(OBJS) $(LIBS)
# Remove the objects and the executable.
clean:
(rm -f *.o main)
# Pattern rule for C sources.
%.o: %.c
$(CC) $(CFLAGS) $(CINCLUDES) $(INCLUDES) $(CDEFS) $(PREC) -c $<
# Pattern rule for Fortran sources; FC and FCFLAGS are not set in this file.
%.o: %.f90
$(FC) $(FCFLAGS) $(FINCLUDES) $(INCLUDES) $(FDEFS) $(DEFINE_PREPEND)$(PREC) -c $<

View file

@ -0,0 +1,54 @@
#include "aux.h"
#define timelim 2.0
/*
 * Microsecond wall-clock timestamp obtained from gettimeofday().
 */
long usecs (){
    struct timeval clock_now;

    gettimeofday(&clock_now, NULL);
    return clock_now.tv_usec + clock_now.tv_sec*1000000;
}
/*
 * Keep the calling thread busy for `sec` seconds of wall-clock time by
 * polling usecs().  The loop condition is first tested with a current time
 * of 0.
 */
void mysleep(double sec){
    const long t0 = usecs();
    long t1 = 0;

    while ((((double) t1) - t0)/1000000 < sec) {
        t1 = usecs();
    }
}
/*
 * Simulated work of constant cost: busy-waits for timelim/n seconds.
 *
 * i : iteration index (not used)
 * n : total number of iterations
 */
void func1(int i, int n){
    mysleep(timelim/((double)n));
}
/*
 * Simulated work whose cost grows with the iteration index: busy-waits for
 * (i+1) * 2*timelim/(n*n) seconds.
 *
 * i : iteration index
 * n : total number of iterations
 */
void func2(int i, int n){
    const double unit = 2.0*timelim/ ((double)(n*n));
    const double weight = (double)(i+1);

    mysleep(weight*unit);
}
/*
 * Simulated work whose cost shrinks with the iteration index: busy-waits for
 * (n-i) * 2*timelim/(n*n) seconds.
 *
 * i : iteration index
 * n : total number of iterations
 */
void func3(int i, int n){
    const double unit = 2.0*timelim/ ((double)(n*n));
    const double weight = (double)(n-i);

    mysleep(weight*unit);
}

Some files were not shown because too many files have changed in this diff Show more