commit f0f362eeee
init
BE/01_RingD/Makefile (new file, 17 lines)
@@ -0,0 +1,17 @@
MPICC=smpicc
CFLAGS=-g -O4

DIR=01_RingD
SRC=ringd

all: ${SRC}

%.o: %.c
	echo $@
	$(MPICC) -c -Wall -o $@ $<

${SRC}: ${SRC}.o
	$(MPICC) -o $@ $^

clean:
	rm -rf *.o ${SRC}
BE/01_RingD/ringd.c (new file, 136 lines)
@@ -0,0 +1,136 @@
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>

int main(int argc, char *argv[])
{

    MPI_Init(&argc, &argv);

    int comm_size;
    MPI_Comm_size(MPI_COMM_WORLD, &comm_size);
    if (comm_size % 2 != 0)
    {
        printf("This application is meant to be run with an even number of MPI processes, not %d.\n", comm_size);
        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
    }

    // Get my rank in the global communicator
    int my_rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);

    // Determine the colour and key based on whether my rank is even.
    char subcommunicator;
    int colour;
    int key;
    if (my_rank % 2 == 0)
    {
        subcommunicator = 'E';
        colour = 0;
        key = my_rank;
    }
    else
    {
        subcommunicator = 'O';
        colour = 1;
        key = comm_size - my_rank;
    }

    // Split the global communicator
    MPI_Comm new_comm;
    MPI_Comm_split(MPI_COMM_WORLD, colour, key, &new_comm);

    int my_new_comm_rank, new_comm_size;
    // Get my rank in the new communicator
    MPI_Comm_rank(new_comm, &my_new_comm_rank);
    // Get the size of the new communicator
    MPI_Comm_size(new_comm, &new_comm_size);

    // Print my new rank and new communicator
    printf("[MPI process %d] I am now MPI process %d in subcommunicator %c.\n", my_rank, my_new_comm_rank, subcommunicator);

    // barrier to tidy up stdout a little
    // MPI_Barrier(MPI_COMM_WORLD);

    int previous, next;
    // determine my neighbours according to my rank in my subcommunicator
    if (my_new_comm_rank == 0)
    {
        previous = new_comm_size - 1;
        next = my_new_comm_rank + 1;
    }
    else if (my_new_comm_rank == new_comm_size - 1)
    {
        previous = my_new_comm_rank - 1;
        next = 0;
    }
    else
    {
        previous = my_new_comm_rank - 1;
        next = my_new_comm_rank + 1;
    }

    // printf("[MPI process %d] new %d previous %d next %d in subcommunicator %c.\n", my_rank, my_new_comm_rank, previous, next, subcommunicator);

    float value = 1.0;
    MPI_Status status;

    // Even: clockwise + multiplication
    if (subcommunicator == 'E')
    {
        // receive value from previous node
        if (my_new_comm_rank != 0)
        {
            MPI_Recv(&value, 1, MPI_FLOAT, previous, 0, new_comm, &status);
            printf("[MPI process %d_%c] RECEIVED as process %d of %d, value = %f\n", my_rank, subcommunicator, my_new_comm_rank, new_comm_size, value);
            value = value * 2.0;
            printf("[MPI process %d_%c] UPDATE, value = %f\n", my_rank, subcommunicator, value);
        }
        else
        {
            printf("[MPI process %d_%c] START, value = %f\n", my_rank, subcommunicator, value);
        }

        // send value to next node
        if (my_new_comm_rank != new_comm_size - 1)
        {
            MPI_Send(&value, 1, MPI_FLOAT, next, 0, new_comm);
            printf("[MPI process %d_%c] SENT as process %d of %d, value = %f\n", my_rank, subcommunicator, my_new_comm_rank, new_comm_size, value);
        }
    }

    // Odd: counter-clockwise + division
    if (subcommunicator == 'O')
    {
        // receive value from next node
        if (my_new_comm_rank != 0)
        {
            MPI_Recv(&value, 1, MPI_FLOAT, next, 0, new_comm, &status);
            printf("[MPI process %d_%c] RECEIVED as process %d of %d, value = %f\n", my_rank, subcommunicator, my_new_comm_rank, new_comm_size, value);
            value = value / 2.0;
            printf("[MPI process %d_%c] UPDATE, value = %f\n", my_rank, subcommunicator, value);
        }
        else
        {
            printf("[MPI process %d_%c] START, value = %f\n", my_rank, subcommunicator, value);
        }

        // send value to previous node (rank 1 is the end of the chain)
        if (my_new_comm_rank != 1)
        {
            MPI_Send(&value, 1, MPI_FLOAT, previous, 0, new_comm);
            printf("[MPI process %d_%c] SENT as process %d of %d, value = %f\n", my_rank, subcommunicator, my_new_comm_rank, new_comm_size, value);
        }
    }

    // barrier to tidy up stdout a little
    // MPI_Barrier(MPI_COMM_WORLD);

    // the end
    printf("[MPI process %d_%c] The End\n", my_rank, subcommunicator);

    // Free the communicator
    MPI_Comm_free(&new_comm);

    MPI_Finalize();

    return EXIT_SUCCESS;
}
BE/02_normA/Makefile (new file, 17 lines)
@@ -0,0 +1,17 @@
MPICC=smpicc
CFLAGS=-g -O4

DIR=02_normA
SRC=normA

all: ${SRC}

%.o: %.c
	echo $@
	$(MPICC) -c -Wall -o $@ $<

${SRC}: ${SRC}.o
	$(MPICC) -o $@ $^

clean:
	rm -rf *.o ${SRC} ${DIR}
BE/02_normA/normA.c (new file, 159 lines)
@@ -0,0 +1,159 @@
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <mpi.h>

void multAv(double x[], double *A, double y[], int m, int n);

void init0(double x[], int n);

double dot(double x[], double y[], int n);

int main(int argc, char *argv[])
{
    int size;
    int const n = 12;
    int my_rank;
    double local_dot, global_dot, normA, reference;

    MPI_Init(&argc, &argv);

    // Get number of processes and check that 4 processes are used
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    if (size != 4)
    {
        printf("This application is meant to be run with 4 MPI processes.\n");
        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
    }

    // Get my rank
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);

    // Declaration and initialization of A (ones for all components);
    // the row-block size, b, is the same on every node
    // (as long as you don't change the constants)
    int b = n / size;
    double *A;

    A = (double *)malloc(b * n * sizeof(double));

    for (int i = 0; i < b; i++)
    {
        for (int j = 0; j < n; j++)
        {
            A[i * n + j] = 1.0;
            reference = 66.000000; // expected norm with A full of ones: sqrt(66 * 66) = 66, where 66 = sum_{i=0}^{11} i

            // A[i*n + j] = (double) my_rank;
            // reference = 97.488461;

            // A[i*n + j] = (double) my_rank*(i+1)+(j+1);
            // reference = 239.899979;

            // printf("Process [%d], A[%d][%d] = %f\n", my_rank, i, j, A[i*n+j]);
        }
    }

    // reference vector to verify that the global vector is correct
    double v_ref[n];
    for (int i = 0; i < n; i++)
    {
        v_ref[i] = (double)i;
    }

    // local vector
    double x_local[b];
    for (int i = 0; i < b; i++)
    {
        x_local[i] = (double)b * my_rank + i;
        // printf("Process [%d], x_local[%d] = %f\n", my_rank, i, x_local[i]);
    }

    // global vector
    double x_global[n];
    init0(x_global, n);

    // Use a collective communication in order to gather on ALL the nodes the
    // parts of the local vectors into the global vector
    MPI_Allgather(x_local, b, MPI_DOUBLE, x_global, b, MPI_DOUBLE, MPI_COMM_WORLD);

    // node 2 checks that the global vector is correct (every printed difference should be 0)
    if (my_rank == 2)
    {
        for (int i = 0; i < n; i++)
        {
            printf("Process [%d], check[%d] = %f\n", my_rank, i, x_global[i] - v_ref[i]);
        }
    }

    MPI_Barrier(MPI_COMM_WORLD);

    // vector y_local = A * x_global
    double y_local[b];
    init0(y_local, b);

    // Perform the multiplication
    multAv(y_local, A, x_global, b, n);

    // each node displays y (with A full of ones, all the components of y
    // should be the same)
    for (int i = 0; i < b; i++)
    {
        printf("Process [%d] y_local[%d] = %f\n", my_rank, i, y_local[i]);
    }

    // Perform the dot product on the local x
    local_dot = dot(x_local, y_local, b);
    printf("Process [%d] local dot %f\n", my_rank, local_dot);

    // Use one single collective communication to perform the reduction into
    // global_dot
    MPI_Allreduce(&local_dot, &global_dot, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);

    // the norm is the square root of global_dot
    normA = sqrt(global_dot);

    // Another node displays the norm
    if (my_rank == 2)
    {
        printf("Process [%d] normA = %f, reference = %f\n", my_rank, normA, reference);
    }

    MPI_Finalize();

    return EXIT_SUCCESS;
}

void multAv(double x[], double *A, double y[], int m, int n)
{
    for (int i = 0; i < m; i++)
    {
        x[i] = 0.0;
        for (int j = 0; j < n; j++)
        {
            x[i] += A[i * n + j] * y[j];
        }
    }
    return;
}

void init0(double x[], int n)
{
    for (int i = 0; i < n; i++)
    {
        x[i] = 0.0;
    }
    return;
}

double dot(double x[], double y[], int n)
{
    double res = 0.0;

    for (int i = 0; i < n; i++)
    {
        res += x[i] * y[i];
    }

    return res;
}
BE/03_overmean/Makefile (new file, 18 lines)
@@ -0,0 +1,18 @@
MPICC=smpicc
CFLAGS=-g -O4


DIR=03_overmean
SRC=overmean

all: ${SRC}

%.o: %.c
	echo $@
	$(MPICC) -c -Wall -o $@ $<

${SRC}: ${SRC}.o
	$(MPICC) -o $@ $^

clean:
	rm -rf *.o ${SRC} ${DIR}
BE/03_overmean/overmean.c (new file, 120 lines)
@@ -0,0 +1,120 @@
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <mpi.h>

int main(int argc, char *argv[])
{
    // comment this line out if you want the same vector for each run
    srand(time(NULL));

    MPI_Init(&argc, &argv);

    // Get number of processes
    int nb_process;
    MPI_Comm_size(MPI_COMM_WORLD, &nb_process);

    // Fix root's rank
    int root_rank = 0;

    // Get my rank
    int my_rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);

    // global size (only the root knows its value)
    int global_size = 0;
    // local size (we fix this value so the distribution is regular)
    int local_size = 3;
    // local and global vectors
    int *local_vector = NULL;
    int *global_vector = NULL;

    // root process
    if (my_rank == root_rank)
    {
        global_size = nb_process * local_size; // so the global vector splits
                                               // into sub-vectors of equal size
        printf("global_size = %d\n", global_size);
        global_vector = (int *)malloc(sizeof(int) * global_size);
        for (int i = 0; i < global_size; i++)
        {
            // global_vector[i] = i;
            global_vector[i] = rand() % 101;
            printf("global_vector[%d] = %d\n", i, global_vector[i]);
        }
    }

    // Each process gets its part of the global vector
    local_vector = (int *)malloc(sizeof(int) * local_size);
    MPI_Scatter(global_vector, local_size, MPI_INT, local_vector, local_size, MPI_INT, root_rank, MPI_COMM_WORLD);

    // print the local vector
    for (int i = 0; i < local_size; i++)
    {
        printf("[%d] local_vector[%d] = %d\n", my_rank, i, local_vector[i]);
    }

    // barrier to tidy up stdout a little
    // MPI_Barrier(MPI_COMM_WORLD);

    // compute the local sum
    int local_sum = 0;
    for (int i = 0; i < local_size; i++)
    {
        local_sum += local_vector[i];
    }
    printf("Process %d computed its local sum = %d.\n", my_rank, local_sum);

    // compute the global sum by a reduction
    int global_sum;
    MPI_Reduce(&local_sum, &global_sum, 1, MPI_INT, MPI_SUM, root_rank, MPI_COMM_WORLD);

    // print the global sum
    if (my_rank == root_rank) {
        printf("Process %d got the global sum = %d.\n", my_rank, global_sum);
    }

    // barrier to tidy up stdout a little
    // MPI_Barrier(MPI_COMM_WORLD);

    float mean; // float!

    // the root computes the mean (it is the only one to know the global size)
    if (my_rank == root_rank)
    {
        mean = ((float)global_sum) / global_size;
        printf("Process %d computed the mean = %f.\n", my_rank, mean);
    }

    // broadcast the mean to all processes
    MPI_Bcast(&mean, 1, MPI_FLOAT, root_rank, MPI_COMM_WORLD);

    // print the mean
    printf("Process %d got the mean = %f.\n", my_rank, mean);

    // barrier to tidy up stdout a little
    // MPI_Barrier(MPI_COMM_WORLD);

    // count the values of the local vector that are over the mean
    int local_number = 0;
    for (int i = 0; i < local_size; i++)
    {
        if (local_vector[i] >= mean)
            local_number++;
    }
    printf("Process %d has %d values over the mean.\n", my_rank, local_number);

    // reduce these counts on the root process
    int over_the_mean;
    MPI_Reduce(&local_number, &over_the_mean, 1, MPI_INT, MPI_SUM, root_rank, MPI_COMM_WORLD);

    // print the total number of values over the mean
    if (my_rank == root_rank) {
        printf("The total number of values over the mean is %d.\n", over_the_mean);
    }

    free(local_vector);
    if (my_rank == root_rank) free(global_vector);

    MPI_Finalize();

    return EXIT_SUCCESS;
}
BE/04_n-corps/n-corps.md (new file, 170 lines)
@@ -0,0 +1,170 @@
# Exercise 4: the N-body problem

This file is part of the graded submission for the Calcul parallèle BE.

## Question 1

Determine which computations can be parallelized and which communications must be set up in the following sequential code. Propose a parallel, message-passing rewrite of this code.

```
variables : force[1,...,N], data[1,...,N]
for t in 1, nb_steps do
    for i in 1, N do
        force[i] = 0
        for j in 1, N do
            force[i] = force[i] + interaction(data[i], data[j])
        end for
    end for
    for i in 1, N do
        data[i] = update(data[i], force[i])
    end for
end for
```

### Answer Q1

Assume we have K processes, with K dividing N.
For example, N = 2K.

```C
variables (global) : K, N, ratio
variables (local)  : ik, force[1,...,ratio]
variables          : data[1,...,N]

// data is both global and local, because it is communicated between processes

ratio = N / K

// Each process k handles `ratio` bodies;
// for example, with `ratio` = 2:
// process 0 -> body 0 + body 1
// process 1 -> body 2 + body 3
// ...

// Every process must know `data`
// (only process 0 knows `data` at the start)
// -> Broadcast data from 0 to all

// this loop is not parallelizable:
// step t-1 is needed to compute step t
for t in 1, nb_steps do

    ik = 0

    // this loop is parallelizable;
    // in the code we "split" the N bodies into chunks of `ratio`
    for i in 1, N do
        if je_mocuppe_de_ce_corps(i, N, K) // one possible way to split

            // reset the forces
            force[ik] = 0

            // compute the total force on the bodies we handle
            for j in 1, N do
                force[ik] = force[ik] + interaction(data[i], data[j])
            end for

            // update our local `data`
            data[i] = update(data[i], force[ik])
            ik++

        end if
    end for

    // once every `data` entry has been updated locally (in each process),
    // all this information must be gathered
    // -> All_Gather of the local data
    // we obtain a `data` synchronized across all processes,
    // as after the initial broadcast

end for
```
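
As a minimal MPI sketch of one time step of this scheme (an illustration, with assumptions: a body is reduced to a single `double`, `interaction` and `update` are assumed to be provided elsewhere, and N is divisible by the number of processes):

```C
#include <mpi.h>

// assumed to exist elsewhere in the exercise:
double interaction(double a, double b);
double update(double d, double f);

// One time step: data is replicated on every rank; each rank owns the
// contiguous block of ratio = N / size bodies starting at `first`.
void nbody_step(double *data, double *force, int N, MPI_Comm comm)
{
    int rank, size;
    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);

    int ratio = N / size;     // bodies handled per rank (assumes size divides N)
    int first = rank * ratio; // first body this rank owns

    // parallelizable part: forces on our own bodies only
    for (int i = first; i < first + ratio; i++)
    {
        force[i - first] = 0.0;
        for (int j = 0; j < N; j++)
            force[i - first] += interaction(data[i], data[j]);
    }

    // update our own block in place
    for (int i = first; i < first + ratio; i++)
        data[i] = update(data[i], force[i - first]);

    // resynchronize data on every rank (in-place Allgather: each rank
    // already holds its own block at the right offset of data)
    MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL,
                  data, ratio, MPI_DOUBLE, comm);
}
```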

## Question 2

Propose a parallel version of the following code.

```
variables : force[1,...,N], data[1,...,N]
for t in 1, nb_steps do
    for i in 1, N do
        force[i] = 0
    end for
    for i in 1, N do
        for j in 1, i-1 do
            f = interaction(data[i],data[j])
            force[i] = force[i] + f
            force[j] = force[j] - f
        end for
    end for
    for i in 1, N do
        data[i] = update(data[i], force[i])
    end for
end for
```

### Answer Q2

```C
variables (global) : force[1,...,N], data[1,...,N]

// Every process must know `data`
// (only process 0 knows `data` at the start)
// -> Broadcast data from 0 to all

// this loop is not parallelizable:
// step t-1 is needed to compute step t
for t in 1, nb_steps do

    // compute the forces (more efficiently):
    // only N(N-1)/2 calls to `interaction` are performed
    for i in 1, N do
        if je_mocuppe_de_ce_corps(i, N, K) // I handle this "column"

            // reset the forces
            force[i] = 0

            // compute the total force on the bodies we handle
            for j in 1, i-1 do
                f = interaction(data[i],data[j])
                force[i] = force[i] + f
                force[j] = force[j] - f
            end for

            // reduce the force contributions computed for each body
            // -> All_reduce

            // update our local `data`
            data[i] = update(data[i], force[i])
        end if
    end for

    // once every `data` entry has been updated locally (in each process),
    // all this information must be gathered
    // -> All_Gather of the local data
    // we obtain a `data` synchronized across all processes,
    // as after the initial broadcast

end for
```
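
A minimal MPI sketch of one such time step (an illustration, same assumptions as the Q1 sketch; note that this variant performs a single `MPI_Allreduce` over the whole force array after the triangular loop, so every rank ends up with the full forces and can update all bodies itself, removing the need for a final Allgather):

```C
#include <mpi.h>

// assumed to exist elsewhere:
double interaction(double a, double b);
double update(double d, double f);
int je_mocuppe_de_ce_corps(int i, int N, int K);

// One time step of the symmetric (Newton's third law) scheme.
// Every rank accumulates partial forces for ALL bodies; a single
// reduction then sums the contributions before the update.
void nbody_step_sym(double *data, double *force_partial, double *force,
                    int N, int K, MPI_Comm comm)
{
    for (int i = 0; i < N; i++)
        force_partial[i] = 0.0;

    // triangular loop: each rank only handles its own columns i
    for (int i = 0; i < N; i++)
    {
        if (!je_mocuppe_de_ce_corps(i, N, K))
            continue;
        for (int j = 0; j < i; j++)
        {
            double f = interaction(data[i], data[j]);
            force_partial[i] += f;
            force_partial[j] -= f; // third-law contribution
        }
    }

    // one reduction per step sums the partial forces of every rank
    MPI_Allreduce(force_partial, force, N, MPI_DOUBLE, MPI_SUM, comm);

    // every rank now has the full forces and updates all bodies
    for (int i = 0; i < N; i++)
        data[i] = update(data[i], force[i]);
}
```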

## Question 3

What are the drawbacks of this version?
Propose a solution to mitigate them.

### Answer Q3

The drawback of this version is that the computations must now be distributed "in a triangle". Since no redundant call to `interaction` is made, the computations performed are the following:

|   | 0 | 1 | 2 | 3 |
|:-:|:-:|:-:|:-:|:-:|
| 0 |   | x | x | x |
| 1 |   |   | x | x |
| 2 |   |   |   | x |
| 3 |   |   |   |   |

We therefore perform $\frac{N(N-1)}{2}$ computations, which are harder to spread over $K = \frac{N}{ratio}$ processes. The naive splitting I used to parallelize the Question 2 code is suboptimal, since the computational load is not equal across processes.

A more efficient approach would be, somewhat like OpenMP tasks, to create a task for each `interaction` computation and to distribute these tasks uniformly across the processes; a cheap approximation of this is sketched below.
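
As a cheap approximation of such balancing (a hypothetical variant of `je_mocuppe_de_ce_corps`, with the caller's rank passed explicitly instead of N):

```C
// Cyclic distribution: body i belongs to process i mod K. Column i of
// the triangle costs i-1 interactions, so a cyclic assignment gives
// every process a mix of cheap and expensive columns and a near-equal
// total load, without any task system.
int je_mocuppe_de_ce_corps(int i, int my_rank, int K)
{
    return (i % K) == my_rank;
}
```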
BE/Makefile (new file, 11 lines)
@@ -0,0 +1,11 @@
SOURCES=01_RingD 02_normA 03_overmean 04_n-corps

all: collect


collect:
	echo ${USER}
	(cd 01_RingD; make clean)
	(cd 02_normA; make clean)
	(cd 03_overmean; make clean)
	tar cvf Calcul_${USER}_`hostname | cut -d'.' -f1`.tar ${SOURCES}
BE/init.sh (new file, 6 lines)
@@ -0,0 +1,6 @@
#!/bin/bash
SIMGRID=/mnt/n7fs/ens/tp_guivarch/opt2021/simgrid-3.31

export PATH=${SIMGRID}/bin:${PATH}

alias smpirun="smpirun -hostfile ${SIMGRID}/archis/cluster_hostfile.txt -platform ${SIMGRID}/archis/cluster_crossbar.xml"
TP1/00_Who_am_i/Makefile (new file, 15 lines)
@@ -0,0 +1,15 @@
MPICC=smpicc
CFLAGS=-g -O4

all: who_am_i

clean:
	rm -f *.o who_am_i

%.o: %.c
	echo $@
	$(MPICC) -c -Wall -o $@ $<

who_am_i: who_am_i.o
	$(MPICC) -o $@ $^
TP1/00_Who_am_i/who_am_i.c (new file, 26 lines)
@@ -0,0 +1,26 @@
#include <stdio.h>
#include <mpi.h>

int main( int argc, char *argv[] ) {

    int rank, size;
    int l;
    char name[MPI_MAX_PROCESSOR_NAME];

    MPI_Init( &argc, &argv );

    // Get rank
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    // Get size
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    // Get name
    MPI_Get_processor_name(name, &l);

    printf("Hello world from process %d of %d on processor named %s\n", rank, size, name);

    MPI_Finalize();

    return 0;
}
TP1/01_Ring/Makefile (new file, 14 lines)
@@ -0,0 +1,14 @@
MPICC=smpicc
CFLAGS=-g -O4

all: ring

clean:
	rm -rf *.o ring

%.o: %.c
	echo $@
	$(MPICC) -c -Wall -o $@ $<

ring: ring.o
	$(MPICC) -o $@ $^
TP1/01_Ring/ring.c (new file, 74 lines)
@@ -0,0 +1,74 @@
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>

int main(int argc, char *argv[])
{

    int value;
    int my_rank, size;
    int previous, next;
    MPI_Status status;

    MPI_Init(NULL, NULL);

    // Get my rank and the number of processes
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    // determine my neighbours according to my rank
    if (my_rank == 0)
    {
        previous = size - 1;
        next = my_rank + 1;
    }
    else if (my_rank == size - 1)
    {
        previous = my_rank - 1;
        next = 0;
    }
    else
    {
        previous = my_rank - 1;
        next = my_rank + 1;
    }

    value = 1;

    // The nodes, starting with node 0, transmit the value to each other,
    // each time multiplying it by 2.
    // At the end of the transmission, the last node holds the value 2^(size-1)
    //
    // Instruction: before each send and after each receive, each node displays
    // - its rank
    // - the type of communication (send, recv)
    // - the value

    // receive value from previous node
    if (my_rank != 0)
    {
        MPI_Recv(&value, 1, MPI_INT, previous, 0, MPI_COMM_WORLD, &status);
        printf("RECEIVED by process %d of %d, value = %d\n", my_rank, size, value);
        value = value * 2;
    }
    else
    {
        printf("START, value = %d\n", value);
    }

    printf("SENDING from process %d of %d, value = %d\n", my_rank, size, value);

    // send value to next node
    if (my_rank != size - 1)
    {
        MPI_Send(&value, 1, MPI_INT, next, 0, MPI_COMM_WORLD);
    }
    else
    {
        printf("The End, value = %d\n", value);
    }

    MPI_Finalize();

    return EXIT_SUCCESS;
}
TP1/02_Limite/Makefile (new file, 14 lines)
@@ -0,0 +1,14 @@
MPICC=smpicc
CFLAGS=-g -O4

all: limite

clean:
	rm -rf *.o limite

%.o: %.c
	echo $@
	$(MPICC) -c -Wall -o $@ $<

limite: limite.o
	$(MPICC) -Dhave_mpi -o $@ $^
TP1/02_Limite/limite.c (new file, 86 lines)
@@ -0,0 +1,86 @@
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>

int main(int argc, char *argv[])
{

    int size;
    int my_rank;
    int data_size = -100;
    int *buffer_send, *buffer_recv;
    int tag;
    MPI_Status status;
    int l;
    char name[MPI_MAX_PROCESSOR_NAME];

    // Make sure that the command line has one argument (the size of the data)
    if (argc != 2)
    {
        printf("usage : limite <data size>\n");
        return EXIT_FAILURE;
    }

    MPI_Init(&argc, &argv);

    // Make sure exactly 2 MPI processes are used
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    if (size != 2)
    {
        printf("%d MPI processes used, please use 2.\n", size);
        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
    }

    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
    MPI_Get_processor_name(name, &l);
    printf("process %d of %d on processor named %s\n", my_rank, size, name);

    // Prepare parameters
    data_size = atoi(argv[1]);
    printf("The size of the data is %d\n", data_size);

    buffer_send = (int *)malloc(data_size * sizeof(int));
    buffer_recv = (int *)malloc(data_size * sizeof(int));
    buffer_send[0] = (my_rank == 0) ? 12345 : 67890;

    tag = 0;

    if (my_rank == 0)
    {
        // node 0 sends its buffer buffer_send of size data_size to node 1
        MPI_Send(buffer_send, data_size, MPI_INT, 1, tag, MPI_COMM_WORLD);
        // node 0 receives in its buffer buffer_recv data from node 1
        MPI_Recv(buffer_recv, data_size, MPI_INT, 1, tag, MPI_COMM_WORLD, &status);
        printf("MPI process %d received value %d from MPI process %d.\n", my_rank, buffer_recv[0], 1);
    }
    else
    {
        // node 1 sends its buffer buffer_send of size data_size to node 0
        MPI_Send(buffer_send, data_size, MPI_INT, 0, tag, MPI_COMM_WORLD);
        // node 1 receives in its buffer buffer_recv data from node 0
        MPI_Recv(buffer_recv, data_size, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
        printf("MPI process %d received value %d from MPI process %d.\n", my_rank, buffer_recv[0], 0);
    }

    free(buffer_send);
    free(buffer_recv);

    MPI_Finalize();

    return EXIT_SUCCESS;
}

// (a) recall for which message size (small, large) MPI_Send behaves asynchronously (resp. synchronously)
// -> small messages are buffered (eager protocol, asynchronous); large messages use a rendezvous protocol (synchronous)

// (b) what will happen when your program, completed as indicated, is called with a message size that makes MPI_Send synchronous?
// -> deadlock: both sends block waiting for the matching receive

// (c) estimate, to within 10 integers, the size limit on two nodes of the same machine
// -> 16383

// (d) propose a solution so that the exchange between the two nodes can work beyond this limit (several answers are possible). You may test them outside the session.
// -> split the buffer so that only small buffers are sent, asynchronously
// -> swap the send/recv order on the second node
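// A minimal sketch of another answer to (d) (an added illustration, under
// the assumption that both ranks call it symmetrically): MPI_Sendrecv
// pairs the send and the receive internally, so it cannot deadlock
// whatever the message size. On 2 processes, the peer rank is 1 - my_rank.
void exchange_safe(int *buffer_send, int *buffer_recv, int data_size,
                   int my_rank, int tag)
{
    MPI_Status status;
    int other = 1 - my_rank;
    MPI_Sendrecv(buffer_send, data_size, MPI_INT, other, tag,
                 buffer_recv, data_size, MPI_INT, other, tag,
                 MPI_COMM_WORLD, &status);
}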
TP1/03_Dot/Makefile (new file, 14 lines)
@@ -0,0 +1,14 @@
MPICC=smpicc
CFLAGS=-g -O4

all: dotp

clean:
	rm -rf *.o dotp

%.o: %.c
	echo $@
	$(MPICC) -c -Wall -o $@ $<

dotp: dotp.o
	$(MPICC) -o $@ $^ -lm
TP1/03_Dot/dotp.c (new file, 91 lines)
@@ -0,0 +1,91 @@
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <mpi.h>

// perform the dot product between the two vectors x and y of size n
float dot(float x[], float y[], int n);

int main(int argc, char *argv[])
{

    int const local_data_size = 5;
    float local_x[local_data_size], local_y[local_data_size];
    float local_dot, global_dot1, global_dot2, reference;
    int borne;

    int my_rank, size;

    MPI_Init(NULL, NULL);

    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    borne = size * local_data_size - 1;
    reference = (float)(borne * (borne + 1) * (2 * borne + 1) / 6); // sum of the first squares: 0^2 + 1^2 + ... + borne^2

    // Initialization of both local vectors with the same values;
    // the global vectors would be [0, 1, ..., size*local_data_size - 1]
    for (int i = 0; i < local_data_size; i++)
    {
        local_x[i] = (float)(local_data_size * my_rank + i);
        local_y[i] = (float)(local_data_size * my_rank + i);
        // printf("[MPI process %d] value[%d]: %f\n", my_rank, i, local_x[i]);
    }

    local_dot = dot(local_x, local_y, local_data_size);

    printf("[MPI process %d] my local dot product: %f\n", my_rank, local_dot);

    /* Two-step operation */

    global_dot1 = 0.0;

    // Step 1
    // Use a collective communication to compute the global dot product
    // in such a way that node 0 gets this value
    MPI_Reduce(&local_dot, &global_dot1, 1, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD);

    // Node 0 displays the global value and the reference (sum of the squares of the first integers)
    if (my_rank == 0)
    {
        printf("[MPI process %d] *Two-step collective operation* global dot product: %f == %f\n", my_rank, global_dot1, reference);
    }

    // Step 2
    // Use a collective communication to broadcast the global value to each node
    MPI_Bcast(&global_dot1, 1, MPI_FLOAT, 0, MPI_COMM_WORLD);

    // A node i (i different from 0) displays the global value
    if (my_rank != 0)
    {
        printf("[MPI process %d] *Two-step collective operation* global dot product: %f == %f\n", my_rank, global_dot1, reference);
    }

    /* One-step operation */

    global_dot2 = 0;

    // Step 3
    // Now use one single collective communication to perform both steps 1 and 2
    MPI_Allreduce(&local_dot, &global_dot2, 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD);

    // every node displays the global value
    printf("[MPI process %d] *One-step collective operation* global dot product: %f == %f\n", my_rank, global_dot2, reference);

    MPI_Finalize();

    return EXIT_SUCCESS;
}

float dot(float x[], float y[], int n)
{
    float res = 0.0;

    for (int i = 0; i < n; i++)
    {
        res += x[i] * y[i];
    }

    return res;
}
TP1/04_Mult/Makefile (new file, 14 lines)
@@ -0,0 +1,14 @@
MPICC=smpicc
CFLAGS=-g -O4

all: MultAv

clean:
	rm -rf *.o MultAv

%.o: %.c
	echo $@
	$(MPICC) -c -Wall -o $@ $<

MultAv: MultAv.o
	$(MPICC) -o $@ $^
TP1/04_Mult/MultAv.c (new file, 119 lines)
@@ -0,0 +1,119 @@
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>

void multAv(double x[], double *A, double y[], int m, int n);

void init0(double x[], int n);

int main(int argc, char *argv[])
{
    int size;
    int const n = 12;
    int my_rank;
    MPI_Init(&argc, &argv);

    // Get number of processes and check that 4 processes are used
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    if (size != 4)
    {
        printf("This application is meant to be run with 4 MPI processes.\n");
        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
    }

    // Get my rank
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);

    // Declaration and initialization of A (ones for all components);
    // the row-block size, b, is the same for every node
    // (as long as you don't change the constants)
    int b = n / size;
    double *A;

    A = (double *)malloc(b * n * sizeof(double));

    for (int i = 0; i < b; i++)
    {
        for (int j = 0; j < n; j++)
        {
            A[i * n + j] = 1.0;
            // A[i*n + j] = (double) my_rank;
            // A[i*n + j] = (double) my_rank*(i+1)+(j+1);
            // printf("Process [%d], A[%d][%d] = %f\n", my_rank, i, j, A[i*n+j]);
        }
    }

    // reference vector to verify that the global vector is correct
    double v_ref[n];
    for (int i = 0; i < n; i++)
    {
        v_ref[i] = (double)i;
    }

    // local vector
    double v_local[b];
    for (int i = 0; i < b; i++)
    {
        v_local[i] = (double)b * my_rank + i;
        // printf("Process [%d], v_local[%d] = %f\n", my_rank, i, v_local[i]);
    }

    // global vector
    double v_global[n];
    init0(v_global, n);

    // Use a collective communication in order to gather on ALL the nodes the
    // parts of the local vectors into the global vector
    MPI_Allgather(v_local, b, MPI_DOUBLE, v_global, b, MPI_DOUBLE, MPI_COMM_WORLD);

    // node 2 checks that the global vector is correct (every printed difference should be 0)
    if (my_rank == 2)
    {
        for (int i = 0; i < n; i++)
        {
            printf("Process [%d], check[%d] = %f\n", my_rank, i, v_global[i] - v_ref[i]);
        }
    }

    MPI_Barrier(MPI_COMM_WORLD);

    // vector x_loc = A * v_global
    double x_loc[b];
    init0(x_loc, b);

    // Perform the multiplication
    multAv(x_loc, A, v_global, b, n);

    // each node displays x (with A full of ones, all the components of x should be the same)
    for (int i = 0; i < b; i++)
    {
        printf("Process [%d], x_loc[%d] = %f\n", my_rank, i, x_loc[i]);
    }

    free(A);

    MPI_Finalize();

    return EXIT_SUCCESS;
}

void multAv(double x[], double *A, double y[], int m, int n)
{
    for (int i = 0; i < m; i++)
    {
        x[i] = 0.0;
        for (int j = 0; j < n; j++)
        {
            x[i] += A[i * n + j] * y[j];
        }
    }
    return;
}

void init0(double x[], int n)
{
    for (int i = 0; i < n; i++)
    {
        x[i] = 0.0;
    }
    return;
}
TP1/05_CG/CG_par (executable file, binary not shown)
TP1/05_CG/CG_par.c (new file, 121 lines)
@@ -0,0 +1,121 @@
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#include <math.h>

#include "util.h"

void cg_par(double *A_local, double *rhs_local, int N, int b, float tol)
{

    int size;
    int my_rank;

    // Get number of processes
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    // Get my rank
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);

    //**************** Parallel CG (M == N)
    int num_it, max_it;
    double x[b], r[b], Ap[b];
    double p_local[b], p_global[N];
    double nr_global, nr_local;
    double np2_global, np2_local;
    double epsilon;
    double alpha, beta;

    max_it = 100;

    // initialization of the solution (local vector)
    for (int i = 0; i < b; i++)
    {
        x[i] = 0.0;
    }

    // compute the global norm of rhs_local (dot product, then sqrt);
    // all the nodes must have this value
    nr_local = dot(rhs_local, rhs_local, b);
    MPI_Allreduce(&nr_local, &nr_global, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    nr_global = sqrt(nr_global);
    // if (my_rank == 0) printf("nr = %lg\n", nr_global);

    // threshold of the CG
    epsilon = tol * nr_global;

    // Initialization of p_local and r (local vectors)
    copy_v(p_local, rhs_local, b);
    copy_v(r, rhs_local, b);

    // number of iterations
    num_it = 0;

    printf("num_it %d -- epsilon %lg -- nr_global %lg\n", num_it, epsilon, nr_global);

    while ((nr_global > epsilon) && (num_it < max_it))
    {

        // Compute the local vector Ap = A_local*p_global
        // => gather the p_local vectors into p_global
        MPI_Allgather(p_local, b, MPI_DOUBLE, p_global, b, MPI_DOUBLE, MPI_COMM_WORLD);

        // display the first component of p_global
        if (my_rank == 0)
            printf("p_global[0] = %lg\n", p_global[0]);

        // do the matrix-vector multiplication
        multAv(Ap, A_local, p_global, b, N);

        // compute the global dot product np2_global = (Ap_global, p_global);
        // all the nodes must have this value
        np2_local = dot(p_local, Ap, b);
        MPI_Allreduce(&np2_local, &np2_global, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
        if (my_rank == 0)
            printf("np2 = %lg\n", np2_global);

        // alpha
        alpha = (nr_global * nr_global) / np2_global;
        // if(my_rank == 0) printf("alpha = %lg\n", alpha);

        // compute the new x and r (local vectors)
        axpy(alpha, x, p_local, b);
        axpy(-alpha, r, Ap, b);

        // compute the global norm of the residual (dot product, then sqrt);
        // all the nodes must have this value
        nr_local = dot(r, r, b);
        MPI_Allreduce(&nr_local, &nr_global, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
        nr_global = sqrt(nr_global);
        // if(my_rank == 0) printf("nr = %lg\n", nr_global);

        // beta
        beta = (nr_global * nr_global) / (alpha * np2_global);
        // if(my_rank == 0) printf("beta = %lg\n", beta);

        // compute the new p_local (local vector)
        xpay(beta, r, p_local, b);

        // increase the number of iterations
        num_it++;

        // if(my_rank == 0) printf("num_it %d -- nr_global %lg\n", num_it, nr_global);
    }

    free(A_local);

    // gather the solution on node 0
    double x_global[N];
    MPI_Gather(x, b, MPI_DOUBLE, x_global, b, MPI_DOUBLE, 0, MPI_COMM_WORLD);

    // display the solution
    if (my_rank == 0)
    {
        for (int i = 0; i < N; i++)
        {
            printf("x[%d] = %lg\n", i, x_global[i]);
        }
    }

    return;
}
TP1/05_CG/CG_par.h (new file, 1 line)
@@ -0,0 +1 @@
void cg_par(double *A_local, double *rhs, int N, int b, float tol);
TP1/05_CG/CG_sq.c (new file, 85 lines)
@@ -0,0 +1,85 @@
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

#include "util.h"

void cg_sq(double *A, double *rhs, int N, double tol)
{

    int num_it, max_it;
    double x[N], p[N], r[N], Ap[N];
    double nr;
    double epsilon;
    double np2, alpha, beta;

    max_it = 100;

    // initialization of the solution
    for (int i = 0; i < N; i++)
    {
        // b[i] = (float) i;
        x[i] = 0.0;
    }

    // compute the norm of the rhs (dot product, then sqrt)
    nr = dot(rhs, rhs, N);
    nr = sqrt(nr);
    printf("nr = %lg\n", nr);

    // threshold of the CG
    epsilon = tol * nr;

    // Initialization of p and r
    copy_v(p, rhs, N);
    copy_v(r, rhs, N);

    // number of iterations
    num_it = 0;

    printf("num_it %d -- epsilon %lg -- nr %lg\n", num_it, epsilon, nr);

    while ((nr > epsilon) && (num_it < max_it))
    {

        // Compute the vector Ap = A*p
        multAv(Ap, A, p, N, N);

        // compute the dot product np2 = (Ap, p)
        np2 = dot(p, Ap, N);
        printf("np2 = %lg\n", np2);

        // alpha
        alpha = (nr * nr) / np2;
        // printf("alpha = %lg\n", alpha);

        // compute the new x and r
        axpy(alpha, x, p, N);
        axpy(-alpha, r, Ap, N);

        // compute the norm of the residual (dot product, then sqrt)
        nr = dot(r, r, N);
        nr = sqrt(nr);
        // printf("nr = %lg\n", nr);

        // beta
        beta = (nr * nr) / (alpha * np2);
        // printf("beta = %lg\n", beta);

        // compute the new p
        xpay(beta, r, p, N);

        // increase the number of iterations
        num_it++;

        // printf("num_it %d -- nr %lg \n", num_it, nr);
    }

    // display the solution
    for (int i = 0; i < N; i++)
    {
        printf("x[%d] = %lg\n", i, x[i]);
    }

    return;
}
TP1/05_CG/CG_sq.h (new file, 1 line)
@@ -0,0 +1 @@
void cg_sq(double *A, double *rhs, int N, double tol);
TP1/05_CG/Laplacien.mtx (new file, 21 lines)
@@ -0,0 +1,21 @@
%%MatrixMarket matrix coordinate real symmetric
%-------------------------------------------------------------------------------
% UF Sparse Matrix Collection, Tim Davis
% http://www.cise.ufl.edu/research/sparse/matrices/HB/nos3
% name: HB/nos3
% [SYMMETRIC MATRIX, FE APPROXIMATION TO BIHARMONIC OPERATOR ON PLATE]
% id: 219
% date: 1982
% author: H. Simon
% ed: I. Duff, R. Grimes, J. Lewis
% fields: title A name id date author ed kind
% kind: structural problem
%-------------------------------------------------------------------------------
4 4 7
1 1 2.0
1 2 -1.0
2 2 2.0
2 3 -1.0
3 3 2.0
3 4 -1.0
4 4 2.0
TP1/05_CG/Makefile (new file, 18 lines)
@@ -0,0 +1,18 @@
CC=gcc
MPICC=smpicc
CFLAGS=-g -O4

all: CG_par CG_sq

clean:
	rm -rf *.o CG_par CG_sq

%.o: %.c
	echo $@
	$(MPICC) -c -Wall -o $@ $<

CG_par: util.o CG_par.o main_par.o
	$(MPICC) -o $@ $^ -lm

CG_sq: util.o CG_sq.o main_sq.o
	$(MPICC) -o $@ $^ -lm
TP1/05_CG/main_par.c (new file, 98 lines)
@@ -0,0 +1,98 @@
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#include <math.h>

#include "util.h"
#include "CG_par.h"

int main(int argc, char* argv[]) {

    int size;
    int my_rank;

    FILE *f;
    int M, N, nz;

    double *A = NULL;
    double *rhs;

    double tol = 1e-6;

    // Make sure that the command line has one argument (name of the matrix file)
    if(argc != 2){
        printf("usage : CG_par <file>\n");
        return EXIT_FAILURE;
    }

    //**************** MPI INITIALIZATION

    MPI_Init(&argc, &argv);

    // Get number of processes and check that 4 processes are used
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    if(size != 4) {
        printf("This application is meant to be run with 4 MPI processes.\n");
        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
    }

    // Get my rank
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);

    //**************** READING THE MATRIX AND DISTRIBUTING THE ROW BLOCKS TO EACH NODE

    // You can test with a small matrix ("Laplacien.mtx")
    // or a larger one ("nos3.mtx")

    f = fopen(argv[1], "r");

    // All nodes get the sizes
    mm_read_mtx_crd_size(f, &M, &N, &nz);
    //printf("%d %d %d\n", M, N, nz);

    // Node 0 reads the matrix
    if(my_rank == 0) {

        A = (double *) malloc(M*N*sizeof(double));
        read_A(f, A, M, N, nz);

        // increase the diagonal to be sure to converge easily
        for (int i = 0; i < M; i++) {
            *(A+i*N+i) = *(A+i*N+i) + 10.0;
        }

    }

    if (f != stdin) fclose(f);

    // DISTRIBUTION OF THE BLOCKS => A_local(b, N)
    int b = M / size;
    double *A_local;

    A_local = (double *) malloc(b*N*sizeof(double));
    MPI_Scatter(A, b*N, MPI_DOUBLE, A_local, b*N, MPI_DOUBLE, 0, MPI_COMM_WORLD);

    if(my_rank == 0) free(A);

    //**************** END OF READING THE MATRIX AND DISTRIBUTING THE ROW BLOCKS

    //**************** PARALLEL CG (M == N)

    rhs = (double *) malloc(b*sizeof(double));

    // initialization of the right-hand side (local vector)
    for(int i = 0; i < b; i++){
        rhs[i] = (float) (b*my_rank + i);
    }

    cg_par(A_local, rhs, N, b, tol);

    free(rhs);

    //**************** END OF PARALLEL CG

    MPI_Finalize();
    printf("The End\n");

    return EXIT_SUCCESS;
}
TP1/05_CG/main_sq.c (new file, 79 lines)
@@ -0,0 +1,79 @@
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <mpi.h>

#include "util.h"
#include "CG_sq.h"

int main(int argc, char* argv[]) {

    int size;

    FILE *f;
    int M, N, nz;

    double *A = NULL;

    double *rhs;

    double tol = 1e-6;

    // Make sure that the command line has one argument (name of the matrix file)
    if(argc != 2){
        printf("usage : CG_sq <file>\n");
        return EXIT_FAILURE;
    }

    MPI_Init(&argc, &argv);

    // Get number of processes and check that only 1 process is used
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    if(size != 1) {
        printf("This application is meant to be run with 1 MPI process.\n");
        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
    }

    //**************** READING THE MATRIX

    // You can test with a small matrix ("Laplacien.mtx")
    // or a larger one ("nos3.mtx")

    f = fopen(argv[1], "r");

    mm_read_mtx_crd_size(f, &M, &N, &nz);
    //printf("%d %d %d\n", M, N, nz);

    A = (double *) malloc(M*N*sizeof(double));
    read_A(f, A, M, N, nz);

    // increase the diagonal to be sure to converge easily
    for (int i = 0; i < M; i++) {
        *(A+i*N+i) = *(A+i*N+i) + 10.0;
    }

    if (f != stdin) fclose(f);

    //**************** END OF READING THE MATRIX

    //**************** SEQUENTIAL CG (M == N)

    rhs = (double *) malloc(N*sizeof(double));

    // initialization of the right-hand side
    for(int i = 0; i < N; i++){
        rhs[i] = (float) i;
    }
    cg_sq(A, rhs, N, tol);

    free(A);
    free(rhs);

    //**************** END OF SEQUENTIAL CG

    MPI_Finalize();
    printf("The End\n");

    return EXIT_SUCCESS;
}
TP1/05_CG/nos3.mtx (new file, 8416 lines; diff suppressed because the file is too large)
114
TP1/05_CG/util.c
Normal file
114
TP1/05_CG/util.c
Normal file
|
@ -0,0 +1,114 @@
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <math.h>
|
||||||
|
#include <ctype.h>
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
|
#include "util.h"
|
||||||
|
|
||||||
|
void multAv(double x[], double *A, double y[], int m, int n){
|
||||||
|
|
||||||
|
for(int i = 0; i < m; i++){
|
||||||
|
x[i] = 0.0;
|
||||||
|
for(int j = 0; j < n; j++){
|
||||||
|
x[i] += A[i*n + j] * y[j];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
void copy_v(double x[], double y[], int n){
|
||||||
|
|
||||||
|
for(int i = 0; i < n; i++){
|
||||||
|
x[i] = y[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
double dot(double x[], double y[], int n){
|
||||||
|
double res = 0.0;
|
||||||
|
|
||||||
|
for(int i = 0; i < n; i++){
|
||||||
|
res += x[i]*y[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
|
void axpy(double a, double x[], double y[], int n){
|
||||||
|
|
||||||
|
for(int i = 0; i < n; i++){
|
||||||
|
x[i] = x[i] + a*y[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
void xpay(double a, double x[], double y[], int n){
|
||||||
|
|
||||||
|
for(int i = 0; i < n; i++){
|
||||||
|
y[i] = x[i] + a*y[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
int read_A(FILE *f, double *A, int M, int N, int nz){
|
||||||
|
int i, j, k;
|
||||||
|
double val;
|
||||||
|
int error;
|
||||||
|
|
||||||
|
for (i = 0; i < M; i++) {
|
||||||
|
for(j = 0; j < N; j++) {
|
||||||
|
*(A+i*N+j) = 0.0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (k = 0; k < nz; k++) {
|
||||||
|
error = fscanf(f, "%d %d %lg\n", &i, &j, &val);
|
||||||
|
if(!error) exit(0);
|
||||||
|
//printf("-- %d -- %d -- %lg\n", i, j, val);
|
||||||
|
*(A + (i-1)*N + (j-1)) = val;
|
||||||
|
// this is a symmetric matrix
|
||||||
|
*(A + (j-1)*N + (i-1)) = val;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
for (k = 0; k < nz; k++) {
|
||||||
|
printf("---- %lg\n", *(A+k));
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
int mm_read_mtx_crd_size(FILE *f, int *M, int *N, int *nz )
{
  char line[MM_MAX_LINE_LENGTH];
  int num_items_read;

  /* set return null parameter values, in case we exit with errors */
  *M = *N = *nz = 0;

  /* now continue scanning until you reach the end-of-comments */
  do
  {
    if (fgets(line,MM_MAX_LINE_LENGTH,f) == NULL)
      return MM_PREMATURE_EOF;
  }while (line[0] == '%');

  /* line[] is either blank or has M,N, nz */
  if (sscanf(line, "%d %d %d", M, N, nz) == 3)
    return 0;

  else
    do
    {
      num_items_read = fscanf(f, "%d %d %d", M, N, nz);
      if (num_items_read == EOF) return MM_PREMATURE_EOF;
    }
    while (num_items_read != 3);

  return 0;
}
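Together, mm_read_mtx_crd_size and read_A are enough to load nos3.mtx into a dense symmetric array: the comment loop in mm_read_mtx_crd_size already skips the MatrixMarket banner. A minimal driver sketch (the file name and error handling here are illustrative assumptions):

#include <stdio.h>
#include <stdlib.h>
#include "util.h"

/* Sketch: load a MatrixMarket coordinate file into a dense M x N array. */
int main(void) {
  FILE *f = fopen("nos3.mtx", "r");
  if (f == NULL) return EXIT_FAILURE;

  int M, N, nz;
  if (mm_read_mtx_crd_size(f, &M, &N, &nz) != 0) return EXIT_FAILURE;

  double *A = malloc((size_t)M * N * sizeof(double));
  read_A(f, A, M, N, nz);   /* fills both (i,j) and (j,i): symmetric matrix */
  fclose(f);

  printf("loaded %d x %d matrix with %d nonzeros\n", M, N, nz);
  free(A);
  return EXIT_SUCCESS;
}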
21
TP1/05_CG/util.h
Normal file
@ -0,0 +1,21 @@
#include <ctype.h>
#include <stdio.h> /* for FILE in the prototypes below */

#define MM_MAX_LINE_LENGTH 1025
#define MatrixMarketBanner "%%MatrixMarket"
#define MM_MAX_TOKEN_LENGTH 64
#define MM_PREMATURE_EOF 12

void multAv(double x[], double *A, double y[], int m, int n);

void copy_v(double x[], double y[], int n);

double dot(double x[], double y[], int n);

void axpy(double a, double x[], double y[], int n);

void xpay(double a, double x[], double y[], int n);

int read_A(FILE *f, double *A, int M, int N, int nz);

int mm_read_mtx_crd_size(FILE *f, int *M, int *N, int *nz);
6
TP1/init.sh
Executable file
@ -0,0 +1,6 @@
#!/bin/bash
# meant to be sourced, so that the PATH export and the alias persist in the calling shell
SIMGRID=/mnt/n7fs/ens/tp_guivarch/opt2021/simgrid-3.31

export PATH=${SIMGRID}/bin:${PATH}

alias smpirun="smpirun -hostfile ${SIMGRID}/archis/cluster_hostfile.txt -platform ${SIMGRID}/archis/cluster_crossbar.xml"
BIN
TP1/tp_mpi.pdf
Normal file
Binary file not shown.
7
TP2/.vscode/settings.json
vendored
Normal file
@ -0,0 +1,7 @@
{
  "files.associations": {
    "*.html": "html",
    "*.toml": "toml",
    "*.bak": "c"
  }
}
26
TP2/Makefile
Normal file
@ -0,0 +1,26 @@
CC=gcc
MPICC=smpicc
LD=smpicc
LDFLAGS=
CFLAGS=-O4
CLIBS=-lblas -llapack
INCLUDES=
SOURCEDIR=src
BUILDDIR=build

all: dir main # test

test_env: dir who_am_i

dir:
	mkdir -p $(BUILDDIR)/bin

clean:
	rm -rf $(BUILDDIR)

%.o: $(SOURCEDIR)/%.c
	echo $@
	$(MPICC) -c -Wall -o $(BUILDDIR)/$@ $< $(CFLAGS) $(INCLUDES)

main: main.o gemms.o ex1.o ex2.o ex3.o utils.o dsmat.o
	$(LD) -o $(BUILDDIR)/bin/$@ $(addprefix $(BUILDDIR)/,$^) $(CLIBS) $(LDFLAGS)
1
TP2/README
Normal file
@ -0,0 +1 @@
https://laurent.fainsin.bzh/assets/CalcPar/
151
TP2/bench.csv
Normal file
@ -0,0 +1,151 @@
m,n,k,b,p,q,algo,lookahead,gflops
1024,1024,1024,256,2,2,p2p,0,7.475035
1024,1024,1024,256,2,2,p2p,0,7.475035
1024,1024,1024,256,2,2,p2p,0,7.475036
1024,1024,1024,256,2,2,p2p,0,7.475036
1024,1024,1024,256,2,2,p2p,0,7.475036
1024,1024,1024,256,2,2,bcast,0,7.471268
1024,1024,1024,256,2,2,bcast,0,7.471269
1024,1024,1024,256,2,2,bcast,0,7.471268
1024,1024,1024,256,2,2,bcast,0,7.471268
1024,1024,1024,256,2,2,bcast,0,7.471269
1024,1024,1024,256,2,2,p2p-i-la,1,14.306685
1024,1024,1024,256,2,2,p2p-i-la,1,14.306689
1024,1024,1024,256,2,2,p2p-i-la,1,14.306691
1024,1024,1024,256,2,2,p2p-i-la,1,14.306689
1024,1024,1024,256,2,2,p2p-i-la,1,14.306691
1024,1024,1024,256,2,2,p2p-i-la,2,9.856253
1024,1024,1024,256,2,2,p2p-i-la,2,9.856253
1024,1024,1024,256,2,2,p2p-i-la,2,9.856254
1024,1024,1024,256,2,2,p2p-i-la,2,9.856254
1024,1024,1024,256,2,2,p2p-i-la,2,9.856254
1024,1024,1024,256,2,2,p2p-i-la,3,14.317787
1024,1024,1024,256,2,2,p2p-i-la,3,14.317789
1024,1024,1024,256,2,2,p2p-i-la,3,14.317793
1024,1024,1024,256,2,2,p2p-i-la,3,14.317793
1024,1024,1024,256,2,2,p2p-i-la,3,14.317793
1024,1024,1024,256,2,2,p2p-i-la,4,14.317787
1024,1024,1024,256,2,2,p2p-i-la,4,14.317787
1024,1024,1024,256,2,2,p2p-i-la,4,14.317793
1024,1024,1024,256,2,2,p2p-i-la,4,14.317793
1024,1024,1024,256,2,2,p2p-i-la,4,14.317793
2048,2048,2048,256,2,2,p2p,0,14.951931
2048,2048,2048,256,2,2,p2p,0,14.951932
2048,2048,2048,256,2,2,p2p,0,14.951932
2048,2048,2048,256,2,2,p2p,0,14.951929
2048,2048,2048,256,2,2,p2p,0,14.951932
2048,2048,2048,256,2,2,bcast,0,14.950045
2048,2048,2048,256,2,2,bcast,0,14.950048
2048,2048,2048,256,2,2,bcast,0,14.950048
2048,2048,2048,256,2,2,bcast,0,14.950046
2048,2048,2048,256,2,2,bcast,0,14.950046
2048,2048,2048,256,2,2,p2p-i-la,1,28.642430
2048,2048,2048,256,2,2,p2p-i-la,1,28.642433
2048,2048,2048,256,2,2,p2p-i-la,1,28.642433
2048,2048,2048,256,2,2,p2p-i-la,1,28.642433
2048,2048,2048,256,2,2,p2p-i-la,1,28.642436
2048,2048,2048,256,2,2,p2p-i-la,2,23.366289
2048,2048,2048,256,2,2,p2p-i-la,2,23.366289
2048,2048,2048,256,2,2,p2p-i-la,2,23.366289
2048,2048,2048,256,2,2,p2p-i-la,2,23.366289
2048,2048,2048,256,2,2,p2p-i-la,2,23.366289
2048,2048,2048,256,2,2,p2p-i-la,3,28.653563
2048,2048,2048,256,2,2,p2p-i-la,3,28.653569
2048,2048,2048,256,2,2,p2p-i-la,3,28.653569
2048,2048,2048,256,2,2,p2p-i-la,3,28.653566
2048,2048,2048,256,2,2,p2p-i-la,3,28.653569
2048,2048,2048,256,2,2,p2p-i-la,4,23.369989
2048,2048,2048,256,2,2,p2p-i-la,4,23.369989
2048,2048,2048,256,2,2,p2p-i-la,4,23.369991
2048,2048,2048,256,2,2,p2p-i-la,4,23.369991
2048,2048,2048,256,2,2,p2p-i-la,4,23.369991
2048,2048,2048,256,2,2,p2p-i-la,5,28.653569
2048,2048,2048,256,2,2,p2p-i-la,5,28.653575
2048,2048,2048,256,2,2,p2p-i-la,5,28.653575
2048,2048,2048,256,2,2,p2p-i-la,5,28.653575
2048,2048,2048,256,2,2,p2p-i-la,5,28.653575
2048,2048,2048,256,2,2,p2p-i-la,6,23.369991
2048,2048,2048,256,2,2,p2p-i-la,6,23.369991
2048,2048,2048,256,2,2,p2p-i-la,6,23.369991
2048,2048,2048,256,2,2,p2p-i-la,6,23.369991
2048,2048,2048,256,2,2,p2p-i-la,6,23.369991
2048,2048,2048,256,2,2,p2p-i-la,7,28.659105
2048,2048,2048,256,2,2,p2p-i-la,7,28.659105
2048,2048,2048,256,2,2,p2p-i-la,7,28.659105
2048,2048,2048,256,2,2,p2p-i-la,7,28.659105
2048,2048,2048,256,2,2,p2p-i-la,7,28.659105
2048,2048,2048,256,2,2,p2p-i-la,8,28.659102
2048,2048,2048,256,2,2,p2p-i-la,8,28.659105
2048,2048,2048,256,2,2,p2p-i-la,8,28.659105
2048,2048,2048,256,2,2,p2p-i-la,8,28.659105
2048,2048,2048,256,2,2,p2p-i-la,8,28.659105
3072,3072,3072,256,2,2,p2p,0,22.428405
3072,3072,3072,256,2,2,p2p,0,22.428407
3072,3072,3072,256,2,2,p2p,0,22.428407
3072,3072,3072,256,2,2,p2p,0,22.428407
3072,3072,3072,256,2,2,p2p,0,22.428407
3072,3072,3072,256,2,2,bcast,0,22.427149
3072,3072,3072,256,2,2,bcast,0,22.427149
3072,3072,3072,256,2,2,bcast,0,22.427152
3072,3072,3072,256,2,2,bcast,0,22.427149
3072,3072,3072,256,2,2,bcast,0,22.427152
3072,3072,3072,256,2,2,p2p-i-la,1,42.976658
3072,3072,3072,256,2,2,p2p-i-la,1,42.976662
3072,3072,3072,256,2,2,p2p-i-la,1,42.976658
3072,3072,3072,256,2,2,p2p-i-la,1,42.976662
3072,3072,3072,256,2,2,p2p-i-la,1,42.976662
3072,3072,3072,256,2,2,p2p-i-la,2,33.027327
3072,3072,3072,256,2,2,p2p-i-la,2,33.027327
3072,3072,3072,256,2,2,p2p-i-la,2,33.027327
3072,3072,3072,256,2,2,p2p-i-la,2,33.027330
3072,3072,3072,256,2,2,p2p-i-la,2,33.027327
3072,3072,3072,256,2,2,p2p-i-la,3,42.987825
3072,3072,3072,256,2,2,p2p-i-la,3,42.987825
3072,3072,3072,256,2,2,p2p-i-la,3,42.987829
3072,3072,3072,256,2,2,p2p-i-la,3,42.987818
3072,3072,3072,256,2,2,p2p-i-la,3,42.987822
3072,3072,3072,256,2,2,p2p-i-la,4,37.356416
3072,3072,3072,256,2,2,p2p-i-la,4,37.356414
3072,3072,3072,256,2,2,p2p-i-la,4,37.356422
3072,3072,3072,256,2,2,p2p-i-la,4,37.356416
3072,3072,3072,256,2,2,p2p-i-la,4,37.356416
3072,3072,3072,256,2,2,p2p-i-la,5,42.991522
3072,3072,3072,256,2,2,p2p-i-la,5,42.991526
3072,3072,3072,256,2,2,p2p-i-la,5,42.991526
3072,3072,3072,256,2,2,p2p-i-la,5,42.991526
3072,3072,3072,256,2,2,p2p-i-la,5,42.991522
3072,3072,3072,256,2,2,p2p-i-la,6,37.359194
3072,3072,3072,256,2,2,p2p-i-la,6,37.359194
3072,3072,3072,256,2,2,p2p-i-la,6,37.359194
3072,3072,3072,256,2,2,p2p-i-la,6,37.359194
3072,3072,3072,256,2,2,p2p-i-la,6,37.359197
3072,3072,3072,256,2,2,p2p-i-la,7,42.991526
3072,3072,3072,256,2,2,p2p-i-la,7,42.991538
3072,3072,3072,256,2,2,p2p-i-la,7,42.991534
3072,3072,3072,256,2,2,p2p-i-la,7,42.991534
3072,3072,3072,256,2,2,p2p-i-la,7,42.991534
3072,3072,3072,256,2,2,p2p-i-la,8,37.359200
3072,3072,3072,256,2,2,p2p-i-la,8,37.359202
3072,3072,3072,256,2,2,p2p-i-la,8,37.359202
3072,3072,3072,256,2,2,p2p-i-la,8,37.359205
3072,3072,3072,256,2,2,p2p-i-la,8,37.359202
3072,3072,3072,256,2,2,p2p-i-la,9,42.991549
3072,3072,3072,256,2,2,p2p-i-la,9,42.991549
3072,3072,3072,256,2,2,p2p-i-la,9,42.991549
3072,3072,3072,256,2,2,p2p-i-la,9,42.991545
3072,3072,3072,256,2,2,p2p-i-la,9,42.991545
3072,3072,3072,256,2,2,p2p-i-la,10,37.359205
3072,3072,3072,256,2,2,p2p-i-la,10,37.359202
3072,3072,3072,256,2,2,p2p-i-la,10,37.359202
3072,3072,3072,256,2,2,p2p-i-la,10,37.359214
3072,3072,3072,256,2,2,p2p-i-la,10,37.359202
3072,3072,3072,256,2,2,p2p-i-la,11,42.995159
3072,3072,3072,256,2,2,p2p-i-la,11,42.995159
3072,3072,3072,256,2,2,p2p-i-la,11,42.995144
3072,3072,3072,256,2,2,p2p-i-la,11,42.995167
3072,3072,3072,256,2,2,p2p-i-la,11,42.995152
3072,3072,3072,256,2,2,p2p-i-la,12,42.995159
3072,3072,3072,256,2,2,p2p-i-la,12,42.995159
3072,3072,3072,256,2,2,p2p-i-la,12,42.995152
3072,3072,3072,256,2,2,p2p-i-la,12,42.995171
3072,3072,3072,256,2,2,p2p-i-la,12,42.995159
39
TP2/bench.sh
Executable file
@ -0,0 +1,39 @@
source utils.sh
echo BENCHMARKING THE METHODS
# you can modify these values
p=2
q=2
P=$((p * q))
#generate_hostfile $P

export OMP_NUM_THREADS=1
export MKL_NUM_THREADS=1

# proper benchmark <--- this could be a TODO for students ? (as in, show weak scaling and/or strong scaling)
#mpi_options="-hostfile hostfiles/hostfile.$P.txt"
mpi_options="-platform platforms/cluster_crossbar.xml -hostfile hostfiles/cluster_hostfile.txt -np $P"
b=256
iter=5
traces="bench_traces"
out="bench_outputs"
csv="bench.csv"
echo m,n,k,b,p,q,algo,lookahead,gflops >$csv
for i in 4 8 12; do

  n=$((i * b))
  m=$n
  k=$n
  la=0
  options="-c"

  for algo in p2p bcast; do
    run
  done

  for la in $(seq 1 $((n / b))); do
    algo="p2p-i-la"
    options="-c -l $la"
    run
  done

done
16
TP2/check.csv
Normal file
@ -0,0 +1,16 @@
m,n,k,b,p,q,algo,lookahead,gflops
2,2,2,2,2,2,p2p,0,0.000172
2,2,2,2,2,2,p2p,0,0.000172
2,2,2,2,2,2,p2p,0,0.000172
2,2,2,2,2,2,p2p,0,0.000172
2,2,2,2,2,2,p2p,0,0.000172
2,2,2,2,2,2,bcast,0,0.000075
2,2,2,2,2,2,bcast,0,0.000075
2,2,2,2,2,2,bcast,0,0.000075
2,2,2,2,2,2,bcast,0,0.000075
2,2,2,2,2,2,bcast,0,0.000075
2,2,2,2,2,2,p2p-i-la,1,0.000223
2,2,2,2,2,2,p2p-i-la,1,0.000223
2,2,2,2,2,2,p2p-i-la,1,0.000223
2,2,2,2,2,2,p2p-i-la,1,0.000223
2,2,2,2,2,2,p2p-i-la,1,0.000223
39
TP2/check.sh
Executable file
@ -0,0 +1,39 @@
source utils.sh
echo CHECKING THE METHODS
# you can modify these values
p=2
q=2
P=$((p * q))
#generate_hostfile $P

export OMP_NUM_THREADS=1
export MKL_NUM_THREADS=1

#mpi_options="-hostfile hostfiles/hostfile.$P.txt"
mpi_options="-platform platforms/cluster_crossbar.xml -hostfile hostfiles/cluster_hostfile.txt -np 4"
b=2
iter=5
traces="check_traces"
out="check_outputs"
csv="check.csv"
echo m,n,k,b,p,q,algo,lookahead,gflops >$csv
for i in 1; do

  n=$((i * b))
  m=$n
  k=$n
  la=0
  options="-c"

  for algo in p2p bcast; do
    run
  done

  for la in $(seq 1 $((n / b))); do
    algo="p2p-i-la"
    options="-c -l $la"
    run
  done

done
256
TP2/hostfiles/cluster_hostfile.txt
Normal file
@ -0,0 +1,256 @@
host-0.hawaii.edu
host-1.hawaii.edu
host-2.hawaii.edu
host-3.hawaii.edu
host-4.hawaii.edu
host-5.hawaii.edu
host-6.hawaii.edu
host-7.hawaii.edu
host-8.hawaii.edu
host-9.hawaii.edu
host-10.hawaii.edu
host-11.hawaii.edu
host-12.hawaii.edu
host-13.hawaii.edu
host-14.hawaii.edu
host-15.hawaii.edu
host-16.hawaii.edu
host-17.hawaii.edu
host-18.hawaii.edu
host-19.hawaii.edu
host-20.hawaii.edu
host-21.hawaii.edu
host-22.hawaii.edu
host-23.hawaii.edu
host-24.hawaii.edu
host-25.hawaii.edu
host-26.hawaii.edu
host-27.hawaii.edu
host-28.hawaii.edu
host-29.hawaii.edu
host-30.hawaii.edu
host-31.hawaii.edu
host-32.hawaii.edu
host-33.hawaii.edu
host-34.hawaii.edu
host-35.hawaii.edu
host-36.hawaii.edu
host-37.hawaii.edu
host-38.hawaii.edu
host-39.hawaii.edu
host-40.hawaii.edu
host-41.hawaii.edu
host-42.hawaii.edu
host-43.hawaii.edu
host-44.hawaii.edu
host-45.hawaii.edu
host-46.hawaii.edu
host-47.hawaii.edu
host-48.hawaii.edu
host-49.hawaii.edu
host-50.hawaii.edu
host-51.hawaii.edu
host-52.hawaii.edu
host-53.hawaii.edu
host-54.hawaii.edu
host-55.hawaii.edu
host-56.hawaii.edu
host-57.hawaii.edu
host-58.hawaii.edu
host-59.hawaii.edu
host-60.hawaii.edu
host-61.hawaii.edu
host-62.hawaii.edu
host-63.hawaii.edu
host-64.hawaii.edu
host-65.hawaii.edu
host-66.hawaii.edu
host-67.hawaii.edu
host-68.hawaii.edu
host-69.hawaii.edu
host-70.hawaii.edu
host-71.hawaii.edu
host-72.hawaii.edu
host-73.hawaii.edu
host-74.hawaii.edu
host-75.hawaii.edu
host-76.hawaii.edu
host-77.hawaii.edu
host-78.hawaii.edu
host-79.hawaii.edu
host-80.hawaii.edu
host-81.hawaii.edu
host-82.hawaii.edu
host-83.hawaii.edu
host-84.hawaii.edu
host-85.hawaii.edu
host-86.hawaii.edu
host-87.hawaii.edu
host-88.hawaii.edu
host-89.hawaii.edu
host-90.hawaii.edu
host-91.hawaii.edu
host-92.hawaii.edu
host-93.hawaii.edu
host-94.hawaii.edu
host-95.hawaii.edu
host-96.hawaii.edu
host-97.hawaii.edu
host-98.hawaii.edu
host-99.hawaii.edu
host-100.hawaii.edu
host-101.hawaii.edu
host-102.hawaii.edu
host-103.hawaii.edu
host-104.hawaii.edu
host-105.hawaii.edu
host-106.hawaii.edu
host-107.hawaii.edu
host-108.hawaii.edu
host-109.hawaii.edu
host-110.hawaii.edu
host-111.hawaii.edu
host-112.hawaii.edu
host-113.hawaii.edu
host-114.hawaii.edu
host-115.hawaii.edu
host-116.hawaii.edu
host-117.hawaii.edu
host-118.hawaii.edu
host-119.hawaii.edu
host-120.hawaii.edu
host-121.hawaii.edu
host-122.hawaii.edu
host-123.hawaii.edu
host-124.hawaii.edu
host-125.hawaii.edu
host-126.hawaii.edu
host-127.hawaii.edu
host-128.hawaii.edu
host-129.hawaii.edu
host-130.hawaii.edu
host-131.hawaii.edu
host-132.hawaii.edu
host-133.hawaii.edu
host-134.hawaii.edu
host-135.hawaii.edu
host-136.hawaii.edu
host-137.hawaii.edu
host-138.hawaii.edu
host-139.hawaii.edu
host-140.hawaii.edu
host-141.hawaii.edu
host-142.hawaii.edu
host-143.hawaii.edu
host-144.hawaii.edu
host-145.hawaii.edu
host-146.hawaii.edu
host-147.hawaii.edu
host-148.hawaii.edu
host-149.hawaii.edu
host-150.hawaii.edu
host-151.hawaii.edu
host-152.hawaii.edu
host-153.hawaii.edu
host-154.hawaii.edu
host-155.hawaii.edu
host-156.hawaii.edu
host-157.hawaii.edu
host-158.hawaii.edu
host-159.hawaii.edu
host-160.hawaii.edu
host-161.hawaii.edu
host-162.hawaii.edu
host-163.hawaii.edu
host-164.hawaii.edu
host-165.hawaii.edu
host-166.hawaii.edu
host-167.hawaii.edu
host-168.hawaii.edu
host-169.hawaii.edu
host-170.hawaii.edu
host-171.hawaii.edu
host-172.hawaii.edu
host-173.hawaii.edu
host-174.hawaii.edu
host-175.hawaii.edu
host-176.hawaii.edu
host-177.hawaii.edu
host-178.hawaii.edu
host-179.hawaii.edu
host-180.hawaii.edu
host-181.hawaii.edu
host-182.hawaii.edu
host-183.hawaii.edu
host-184.hawaii.edu
host-185.hawaii.edu
host-186.hawaii.edu
host-187.hawaii.edu
host-188.hawaii.edu
host-189.hawaii.edu
host-190.hawaii.edu
host-191.hawaii.edu
host-192.hawaii.edu
host-193.hawaii.edu
host-194.hawaii.edu
host-195.hawaii.edu
host-196.hawaii.edu
host-197.hawaii.edu
host-198.hawaii.edu
host-199.hawaii.edu
host-200.hawaii.edu
host-201.hawaii.edu
host-202.hawaii.edu
host-203.hawaii.edu
host-204.hawaii.edu
host-205.hawaii.edu
host-206.hawaii.edu
host-207.hawaii.edu
host-208.hawaii.edu
host-209.hawaii.edu
host-210.hawaii.edu
host-211.hawaii.edu
host-212.hawaii.edu
host-213.hawaii.edu
host-214.hawaii.edu
host-215.hawaii.edu
host-216.hawaii.edu
host-217.hawaii.edu
host-218.hawaii.edu
host-219.hawaii.edu
host-220.hawaii.edu
host-221.hawaii.edu
host-222.hawaii.edu
host-223.hawaii.edu
host-224.hawaii.edu
host-225.hawaii.edu
host-226.hawaii.edu
host-227.hawaii.edu
host-228.hawaii.edu
host-229.hawaii.edu
host-230.hawaii.edu
host-231.hawaii.edu
host-232.hawaii.edu
host-233.hawaii.edu
host-234.hawaii.edu
host-235.hawaii.edu
host-236.hawaii.edu
host-237.hawaii.edu
host-238.hawaii.edu
host-239.hawaii.edu
host-240.hawaii.edu
host-241.hawaii.edu
host-242.hawaii.edu
host-243.hawaii.edu
host-244.hawaii.edu
host-245.hawaii.edu
host-246.hawaii.edu
host-247.hawaii.edu
host-248.hawaii.edu
host-249.hawaii.edu
host-250.hawaii.edu
host-251.hawaii.edu
host-252.hawaii.edu
host-253.hawaii.edu
host-254.hawaii.edu
host-255.hawaii.edu
16
TP2/hostfiles/hostfile.txt
Normal file
@ -0,0 +1,16 @@
node-0.simgrid.org
node-1.simgrid.org
node-2.simgrid.org
node-3.simgrid.org
node-4.simgrid.org
node-5.simgrid.org
node-6.simgrid.org
node-7.simgrid.org
node-8.simgrid.org
node-9.simgrid.org
node-10.simgrid.org
node-11.simgrid.org
node-12.simgrid.org
node-13.simgrid.org
node-14.simgrid.org
node-15.simgrid.org
4
TP2/init.sh
Normal file
@ -0,0 +1,4 @@
#!/bin/bash
SIMGRID=/mnt/n7fs/ens/tp_guivarch/opt2021/simgrid-3.31

export PATH=${SIMGRID}/bin:${PATH}
117
TP2/log.txt
Normal file
@ -0,0 +1,117 @@
File smpi_simgrid.trace

Errors :
150 : Unknown container: 0
153 : Unknown container: 0
156 : Unknown container: 0
165 : Unknown container: 0
168 : Unknown container: 0
171 : Unknown container: 0
185 : Unknown container: 0
191 : Unknown container: 0
199 : Unknown container: 0
205 : Unknown container: 0
207 : Unknown container: 0
213 : Unknown container: 0
216 : Unknown container: 0
221 : Unknown container: 0
223 : Unknown container: 0
231 : Unknown container: 0
236 : Unknown container: 0
243 : Unknown container: 0
275 : Unknown container: 0
283 : Unknown container: 0
285 : Unknown container: 0
287 : Unknown container: 0
294 : Unknown container: 0
303 : Unknown container: 0
362 : Unknown container: 0
364 : Unknown container: 0
366 : Unknown container: 0
371 : Unknown container: 0
373 : Unknown container: 0
375 : Unknown container: 0
380 : Unknown container: 0
382 : Unknown container: 0
384 : Unknown container: 0
389 : Unknown container: 0
391 : Unknown container: 0
393 : Unknown container: 0
398 : Unknown container: 0
400 : Unknown container: 0
402 : Unknown container: 0
407 : Unknown container: 0
409 : Unknown container: 0
411 : Unknown container: 0
416 : Unknown container: 0
418 : Unknown container: 0
420 : Unknown container: 0
425 : Unknown container: 0
427 : Unknown container: 0
429 : Unknown container: 0
434 : Unknown container: 0
436 : Unknown container: 0
438 : Unknown container: 0
443 : Unknown container: 0
445 : Unknown container: 0
447 : Unknown container: 0
570 : Unknown container: 0
573 : Unknown container: 0
576 : Unknown container: 0
585 : Unknown container: 0
588 : Unknown container: 0
591 : Unknown container: 0
604 : Unknown container: 0
612 : Unknown container: 0
619 : Unknown container: 0
625 : Unknown container: 0
627 : Unknown container: 0
633 : Unknown container: 0
635 : Unknown container: 0
641 : Unknown container: 0
643 : Unknown container: 0
650 : Unknown container: 0
656 : Unknown container: 0
663 : Unknown container: 0
695 : Unknown container: 0
703 : Unknown container: 0
705 : Unknown container: 0
707 : Unknown container: 0
713 : Unknown container: 0
723 : Unknown container: 0
782 : Unknown container: 0
784 : Unknown container: 0
786 : Unknown container: 0
791 : Unknown container: 0
793 : Unknown container: 0
795 : Unknown container: 0
800 : Unknown container: 0
802 : Unknown container: 0
804 : Unknown container: 0
809 : Unknown container: 0
811 : Unknown container: 0
813 : Unknown container: 0
818 : Unknown container: 0
820 : Unknown container: 0
822 : Unknown container: 0
827 : Unknown container: 0
829 : Unknown container: 0
831 : Unknown container: 0
836 : Unknown container: 0
838 : Unknown container: 0
840 : Unknown container: 0
845 : Unknown container: 0
847 : Unknown container: 0
849 : Unknown container: 0
854 : Unknown container: 0
856 : Unknown container: 0
858 : Unknown container: 0
863 : Unknown container: 0
865 : Unknown container: 0
867 : Unknown container: 0

Warnings :
1 : the definition is not identified
2 : the definition is not identified

Your trace has 108 errors and 2 warnings.
7
TP2/platforms/cluster_crossbar.xml
Normal file
@ -0,0 +1,7 @@
<?xml version='1.0'?>
<!DOCTYPE platform SYSTEM "http://simgrid.gforge.inria.fr/simgrid/simgrid.dtd">
<platform version="4.1">
  <zone id="AS0" routing="Full">
    <cluster id="my_cluster" prefix="host-" suffix=".hawaii.edu" radical="0-255" speed="1Gf" bw="125Mbps" lat="5us"/>
  </zone>
</platform>
17
TP2/platforms/cluster_fat_tree.xml
Normal file
@ -0,0 +1,17 @@
<?xml version='1.0'?>
<!DOCTYPE platform SYSTEM "https://simgrid.org/simgrid.dtd">
<platform version="4.1">
  <!-- This is an example of a fat tree cluster.
       It is taken from figure 1(b) of the paper "D-Mod-K Routing Providing Non-Blocking Traffic for Shift Permutations on
       Real Life Fat Trees" available at https://ece.technion.ac.il/wp-content/uploads/2021/01/publication_776.pdf
       This defines a two-level fat tree, with 4 leaf switches connected to 4 nodes each and 2 core switches connected to
       each leaf switch by two cables -->

  <zone id="world" routing="Full">
    <cluster id="bob_cluster"
             prefix="node-" radical="0-15" suffix=".simgrid.org"
             speed="1Gf" bw="125MBps" lat="50us"
             topology="FAT_TREE" topo_parameters="2;4,4;1,2;1,2"
             loopback_bw="100MBps" loopback_lat="0" />
  </zone>
</platform>
17
TP2/platforms/default.xml
Normal file
@ -0,0 +1,17 @@
<?xml version='1.0'?>
<!DOCTYPE platform SYSTEM "https://simgrid.org/simgrid.dtd">
<platform version="4.1">
  <!-- This is an example of a fat tree cluster.
       It is taken from figure 1(b) of the paper "D-Mod-K Routing Providing Non-Blocking Traffic for Shift Permutations on
       Real Life Fat Trees" available at https://ece.technion.ac.il/wp-content/uploads/2021/01/publication_776.pdf
       This defines a two-level fat tree, with 4 leaf switches connected to 4 nodes each and 2 core switches connected to
       each leaf switch by two cables -->

  <zone id="world" routing="Full">
    <cluster id="bob_cluster"
             prefix="node-" radical="0-15" suffix=".simgrid.org"
             speed="1Gf" bw="125MBps" lat="50us"
             topology="FAT_TREE" topo_parameters="2;4,4;1,2;1,2"
             loopback_bw="100MBps" loopback_lat="0" />
  </zone>
</platform>
277
TP2/platforms/simgrid_update_xml.pl
Executable file
@ -0,0 +1,277 @@
#! /usr/bin/env perl
eval 'exec perl -S $0 ${1+"$@"}'
  if $running_under_some_shell;

# This script updates the simgrid XML file passed as argument (modification in place)
# It is built to do the conversion incrementally.

# Copyright (c) 2006-2022. The SimGrid Team.
# All rights reserved.
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of the license (GNU LGPL) which comes with this package.

=encoding UTF-8

=head1 NAME

simgrid_update_xml - updates simgrid XML files to latest version

=head1 SYNOPSIS

B<simgrid_update_xml> I<xml_file>

=head1 DESCRIPTION

simgrid_update_xml updates the simgrid XML file passed as argument. The file
is modified in place, without any kind of backup. You may want to save a copy
before running the script.

In SimGrid XML files, the standard version is indicated in the version
attribute of the platform tag. Current version is 4. Here is a list of major
changes in each version.

=over 4

=item B<Version 0:> Used before SimGrid 3.3

=item B<Version 1:> Introduced in SimGrid 3.3

=over 4

=item

The version attribute of platform was added to allow file versioning.

=item

The link bandwidth changed from Mb/s to b/s, and the CPU power was changed
from MFlop/s to Flop/s

=back

=item B<Version 2:> Introduced in SimGrid 3.4

=over

=item

Several tags were renamed:

  CPU -> HOST
  NETWORK_LINK -> LINK
  ROUTE_ELEMENT -> LINK_CTN
  PLATFORM_DESCRIPTION -> PLATFORM

=back

=item B<Version 3:> Introduced in SimGrid 3.5

=over 4

=item

The AS tag was introduced. Every platform should now contain an englobing AS
tag.

=item

Routes are now symmetric by default.

=item

Several tags were renamed (for the sake of XML sanity):

  LINK:CTN -> LINK_CTN
  TRACE:CONNECT -> TRACE_CONNECT

=back

=item B<Version 4:> Introduced in SimGrid 3.13

=over 4

=item

Rename the attributes describing the amount of flop that a host / peer / cluster / cabinet can deliver per second.

  <host power=...> -> <host speed=...>

=item

In <trace_connect>, attribute kind="POWER" is now kind="SPEED".

=item

The DOCTYPE points to the right URL.

=item

Units are now mandatory in attributes. USE THE SCRIPT sg_xml_unit_converter.py TO CONVERT THIS

  - speed. Old default: 'f' or 'flops'. Also defined:
    'Yf', 'Zf', 'Ef', 'Pf', 'Tf', 'Gf', 'Mf', 'kf'
    'yottaflops', 'zettaflops', 'exaflops', 'petaflops', 'teraflops', 'gigaflops', 'megaflops', 'kiloflops'

  - bandwidth. Old default: 'Bps' bytes per second (or 'bps' but 1 Bps = 8 bps)
    Also defined in bytes: 'TiBps', 'GiBps', 'MiBps', 'KiBps', 'TBps', 'GBps', 'MBps', 'kBps', 'Bps'
    And the same in bits: 'Tibps', 'Gibps', 'Mibps', 'Kibps', 'Tbps', 'Gbps', 'Mbps', 'kbps', 'bps'

  - latency. Old default: 's' second. Also defined:
    'w' week, 'd' day, 'h' hour, 'm' minute, 'ms' millisecond, 'us' microsecond, 'ns' nanosecond, 'ps' picosecond

=back

=item B<Version 4.1:> Introduced in SimGrid 3.16 (this is the current version).

=over 4

=item

Rename a few tags, but in a backward-compatible manner: the old names are still accepted.

  AS -> zone
  ASroute -> zoneRoute
  bypassAsRoute -> bypassZoneRoute
  process -> actor

=back

=item Other backward-compatible changes (old syntax is still accepted) for which we did not bump the DTD version:

=over 4

=item

Rename the FULLDUPLEX sharing into SPLITDUPLEX.

=item

In <host> and <peer>, rename the 'availability_file' attribute into 'speed_file'.

=back

=back

=head1 AUTHORS

The SimGrid team

=head1 COPYRIGHT AND LICENSE

Copyright (c) 2006-2022. The SimGrid Team. All rights reserved.

This program is free software; you may redistribute it and/or modify it
under the terms of GNU LGPL (v2.1) license.

=cut

use strict;

my $fromversion=-1;
my $toversion=4.1;

my $filename = $ARGV[0] or die "Usage: simgrid_update_xml.pl file_to_convert.xml\nPlease provide an XML to convert as a parameter.\n";
open INPUT, "$filename" or die "Cannot open input file $filename: $!\n";

my $output_string = "<?xml version='1.0'?>\n".
    "<!DOCTYPE platform SYSTEM \"https://simgrid.org/simgrid.dtd\">\n".
    "<platform version=\"$toversion\">\n";

my($AS_opened)=0;

my $line;
while (defined($line = <INPUT>)) {
  chomp $line;
  # eat the header, whatever form it has
  next if ($line =~ s/<\?xml[^>]*>// && ! $line =~ /\S/); # just in case several tags are on the same line
  next if ($line =~ s/<!DOCTYPE[^>]*>// && ! $line =~ /\S/);

  if ($line =~ s/<platform(_description)? *>//) {
    $fromversion = 0;
    print "$filename was using version 0\n";
    next if !$line =~ /\S/;
  } elsif ($line =~ s/<platform.*version=["']*([0-9.]*)["']*>//) {
    $fromversion = $1;
    if ($fromversion == $toversion) {
      warn "Input platform file $filename is already conformant to version $fromversion. This should be a no-op.\n";
    }
    if ($fromversion > $toversion) {
      die "Input platform file $filename is more recent than this script (file version: $fromversion; script version: $toversion)\n";
    }
    next if !$line =~ /\S/;
    print "$filename was using version $fromversion\n";
  }

  if ($fromversion == 0) {
    while ($line =~ m|^(.*?)<cpu(.*?)power="([^"]*)"(.*)$|) {
      $line = "$1TOTOTUTUTATA${2}TOTOTUTUTATA".($3*1000000)."TOTOTUTUTATA${4}";
    }
    while ($line =~ /^(.*?)TOTOTUTUTATA(.*?)TOTOTUTUTATA(.*?)TOTOTUTUTATA(.*)$/) {
      $line = "$1<cpu${2}power=\"$3\"$4";
    }
    while ($line =~ m|^(.*?)<network_link(.*?)bandwidth="([^"]*)"(.*?)$|) {
      $line = "$1TOTOTUTUTATA${2}TOTOTUTUTATA".($3*1000000)."TOTOTUTUTATA${4}";
    }
    while ($line =~ /^(.*?)TOTOTUTUTATA(.*?)TOTOTUTUTATA(.*?)TOTOTUTUTATA(.*?)$/) {
      $line = "$1<network_link${2}bandwidth=\"$3\"$4";
    }
  }

  if ($fromversion < 2) {
    # The renamings (\b=zero-width word boundary check)
    $line =~ s/\bplatform_description\b/platform/g;
    $line =~ s/\bname\b/id/g;
    $line =~ s/\bcpu\b/host/g;
    $line =~ s/\bnetwork_link\b/link/g;
    $line =~ s/\broute_element\b/link:ctn/g;
  }

  if ($fromversion < 3) {
    $line =~ s/\blink:ctn\b/link_ctn/g;
    $line =~ s/\btrace:connect\b/trace_connect/g;

    if($AS_opened && (($line=~ /<\/platform>/) || ($line=~ /<process/))) {
      $output_string .= "</AS>\n";
      $AS_opened = 0;
    }

    if( (!$AS_opened) && (
        ($line =~ /<host/) ||
        ($line =~ /<link/) ||
        ($line =~ /<cluster/) ||
        ($line =~ /<router/)
      )) {
      $output_string .= " <AS id=\"AS0\" routing=\"Full\">\n";
      $AS_opened=1;
    }

    if($line=~/<route /){$line =~ s/\<route/\<route symmetrical=\"NO\"/g;}
  }
  if ($fromversion < 4) {
    $line =~ s/\bpower\b/speed/g;
    $line =~ s/\bkind="POWER"/kind="SPEED"/g;
  }
  if ($fromversion < 4.1) {
    $line =~ s/\bAS\b/zone/g;
    $line =~ s/\bASroute\b/zoneRoute/g;
    $line =~ s/\bbypassAsRoute\b/bypassZoneRoute/g;
    $line =~ s/\bprocess\b/actor/g;
  }
  $line =~ s/\bFULLDUPLEX\b/SPLITDUPLEX/g;
  $line =~ s/\bavailability_file\b/speed_file/g;

  $output_string .= "$line\n";
}

close INPUT;

if ($fromversion == -1) {
  die "Cannot retrieve the platform version of $filename\n";
}

open OUTPUT, "> $filename";
print OUTPUT $output_string;
close OUTPUT;
360
TP2/src/dsmat.c
Normal file
@ -0,0 +1,360 @@
#include <mpi.h>
#include <stdlib.h>
#include <stdio.h>
#include <cblas.h>

#include "simgrid/actor.h"
#include <simgrid/exec.h>

#include "utils.h"
#include "dsmat.h"

/* Tracing purposes */
static char* COMPUTE = "Computing";
static char* IDLE = "Idling";

void init_trace() {
  // TRACE_host_state_declare(COMPUTE);
  // TRACE_host_state_declare(IDLE);
}

int dsmat_fill(Matrix* a, int m, int n, int b, int p, int q, char* name) {
  int me, node;
  MPI_Comm_rank(MPI_COMM_WORLD, &me);
  int mb = m/b, nb = n/b;
  int ii, jj;
  int row, col;
  a->mb = mb;
  a->nb = nb;
  a->b = b;
  //printf("%d] %s : m x n (b) = %d x %d (%d)\n", me, name, mb, nb, b);
  a->blocks = calloc(mb,sizeof(Block*));
  for (ii = 0; ii < mb; ii++) {
    a->blocks[ii] = calloc(nb,sizeof(Block));
    for (jj = 0; jj < nb; jj++) {
      node = get_node(p,q,ii,jj);
      node_coordinates_2i(p,q,node,&row,&col);
      a->blocks[ii][jj].owner = node;
      a->blocks[ii][jj].row = row;
      a->blocks[ii][jj].col = col;
      a->blocks[ii][jj].request = MPI_REQUEST_NULL;
      if (me == a->blocks[ii][jj].owner) {
        //printf("%d]allocating x_%d,%d\n",me,ii,jj);
        a->blocks[ii][jj].c = calloc(b*b,sizeof(float));
        rand_mat(b,b,a->blocks[ii][jj].c,10);
      } else {
        a->blocks[ii][jj].c = NULL;
      }
    }
  }
  return 0;
}

int dsmat_fill_v(Matrix* a, int m, int n, int b, int p, int q, char* name, float value) {
  int me, node;
  MPI_Comm_rank(MPI_COMM_WORLD, &me);
  int mb = m/b, nb = n/b;
  int ii, jj;
  int row, col;
  a->mb = mb;
  a->nb = nb;
  a->b = b;
  a->blocks = calloc(mb,sizeof(Block*));
  for (ii = 0; ii < mb; ii++) {
    a->blocks[ii] = calloc(nb,sizeof(Block));
    for (jj = 0; jj < nb; jj++) {
      node = get_node(p,q,ii,jj);
      node_coordinates_2i(p,q,node,&row,&col);
      a->blocks[ii][jj].owner = node;
      a->blocks[ii][jj].row = row;
      a->blocks[ii][jj].col = col;
      a->blocks[ii][jj].request = MPI_REQUEST_NULL;
      if (me == a->blocks[ii][jj].owner) {
        //printf("%d]allocating x_%d,%d to fill with %f\n",me,ii,jj, value);
        a->blocks[ii][jj].c = calloc(b*b,sizeof(float));
        val_mat(b,b,a->blocks[ii][jj].c,value);
      } else {
        a->blocks[ii][jj].c = NULL;
      }
    }
  }
  return 0;
}

int dsmat_fill_s(Matrix* a, int m, int n, int b, int p, int q, char* name) {
  int me, node;
  MPI_Comm_rank(MPI_COMM_WORLD, &me);
  int mb = m/b, nb = n/b;
  int ii, jj;
  int row, col;
  a->mb = mb;
  a->nb = nb;
  a->b = b;
  a->blocks = calloc(mb,sizeof(Block*));
  for (ii = 0; ii < mb; ii++) {
    a->blocks[ii] = calloc(nb,sizeof(Block));
    for (jj = 0; jj < nb; jj++) {
      node = get_node(p,q,ii,jj);
      node_coordinates_2i(p,q,node,&row,&col);
      a->blocks[ii][jj].owner = node;
      a->blocks[ii][jj].row = row;
      a->blocks[ii][jj].col = col;
      a->blocks[ii][jj].request = MPI_REQUEST_NULL;
      if (me == a->blocks[ii][jj].owner) {
        //printf("%d] s_allocating %s_%d,%d to fill with %f\n",me,name,ii,jj,(float)nb*(ii+1)+(jj+1));
        a->blocks[ii][jj].c = calloc(b*b,sizeof(float));
        val_mat(b,b,a->blocks[ii][jj].c,(float) nb*(ii+1)+(jj+1));
      } else {
        a->blocks[ii][jj].c = NULL;
      }
    }
  }
  return 0;
}

int dsmat_destroy(Matrix* a, char* name) {
  int me;
  MPI_Comm_rank(MPI_COMM_WORLD, &me);
  int mb = a->mb, nb = a->nb;
  //printf("[%d] destroying matrix %s (mb=%d,nb=%d,b=%d)\n",me, name, mb, nb, a->b);
  int ii, jj;
  Block * a_ij;
  for (ii = 0; ii < mb ; ii++) {
    for (jj = 0; jj < nb ; jj++) {
      a_ij = & a->blocks[ii][jj];
      //if (a_ij->c != NULL) { // && a_ij.owner == me) {
      if (a_ij->c != NULL && a_ij->owner == me) {
        free(a_ij->c);
      }
    }
    free(a->blocks[ii]);
  }
  free(a->blocks);
  return 0;
}

int dsmat_scal_check(Matrix* A, float alpha) {
  int i,j;
  int me;
  if (alpha == 0.0) return 0;
  MPI_Comm_rank(MPI_COMM_WORLD, &me);
  Block* Aij;
  for(i = 0; i < A->mb; i++) {
    for(j = 0; j < A->nb; j++) {
      Aij = & A->blocks[i][j];
      if (Aij->owner == me) {
        double computation_amount = 2.0*A->b*A->b*A->b; /* flop estimate (currently unused) */
        /* with a GEMM alpha of 0.0, this call reduces to Aij->c <- alpha * Aij->c */
        cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, A->b, A->b, A->b,
                    0.0, Aij->c, A->b, Aij->c, A->b,
                    alpha, Aij->c, A->b);
      }
    }
  }
  return 0;
}

int dsmat_scal(Matrix* A, float alpha) {
  int i,j;
  int me;
  if (alpha == 0.0) return 0;
  MPI_Comm_rank(MPI_COMM_WORLD, &me);
  Block* Aij;
  SMPI_SAMPLE_LOCAL(i = 0, i < A->mb, i++, 10, 0.005) {
    SMPI_SAMPLE_LOCAL(j = 0, j < A->nb, j++, 10, 0.005) {
      Aij = & A->blocks[i][j];
      if (Aij->owner == me) {
        double computation_amount = 2.0*A->b*A->b*A->b; /* flop estimate (currently unused) */
        /* with a GEMM alpha of 0.0, this call reduces to Aij->c <- alpha * Aij->c */
        cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, A->b, A->b, A->b,
                    0.0, Aij->c, A->b, Aij->c, A->b,
                    alpha, Aij->c, A->b);
      }
    }
  }
  return 0;
}

// FIXME : remove alpha/beta
int local_outer_product_check(float alpha, Matrix* A, Matrix* B, Matrix* C, int l, int p, int q) {
  int i, j, err;
  for(i = 0; i < C->mb; i++) {
    for(j = 0; j < C->nb; j++) {
      err = compute_local_op(alpha, A, B, C, i, j, l);
      if (err != 0) return 1;
    }
  }
  /* free useless memory */
  free_local_op(A, B, l, p, q);
  return 0;
}

int local_outer_product(float alpha, Matrix* A, Matrix* B, Matrix* C, int l, int p, int q) {
  int i, j, err;
  SMPI_SAMPLE_LOCAL(i = 0, i < C->mb, i++, 10, 0.005) {
    SMPI_SAMPLE_LOCAL(j = 0, j < C->nb, j++, 10, 0.005) {
      err = compute_local_op(alpha, A, B, C, i, j, l);
      if (err != 0) return 1;
    }
  }
  /* free useless memory */
  free_local_op(A, B, l, p, q);
  return 0;
}

int compute_local_op(float alpha, Matrix* A, Matrix* B, Matrix* C, int i, int j, int l) {
  int me;
  int b;
  Block *Ail, *Blj, *Cij;
  MPI_Comm_rank(MPI_COMM_WORLD, &me);
  Cij = & C->blocks[i][j];
  b = C->b;
  if (Cij->owner == me) {
    Ail = & A->blocks[i][l];
    if (Ail->c == NULL) { return 1; }
    Blj = & B->blocks[l][j];
    if (Blj->c == NULL) { return 2; }
    // TRACE_host_set_state(COMPUTE);
    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, b,b,b,
                alpha, Ail->c, b, Blj->c, b,
                1.0, Cij->c, b);
    // TRACE_host_set_state(IDLE);
  }
  return 0;
}

int free_local_op(Matrix* A, Matrix* B, int l, int p, int q) {
  int i,j;
  int me, me_coord[2];
  MPI_Comm_rank(MPI_COMM_WORLD, &me);
  node_coordinates(p,q,me,me_coord);
  Block *Ail, *Blj;
  for (i = 0; i < A->mb; i++) {
    Ail = & A->blocks[i][l];
    if (Ail->owner != me && Ail->c != NULL) {
      free(Ail->c);
      Ail->c = NULL;
    }
  }
  for (j = 0; j < B->nb; j++) {
    Blj = & B->blocks[l][j];
    if (Blj->owner != me && Blj->c != NULL) {
      free(Blj->c);
      Blj->c = NULL;
    }
  }
  return 0;
}

int block_copy(float * a, float * b, int m, int n) {
  int i, j;
  for (i = 0; i < m ; i++) {
    for (j = 0; j < n ; j++) {
      a[n*i+j] = b[n*i+j];
    }
  }
  return 0;
}

int block_print(float * a, int m, int n, char* name) {
  int i, j;
  printf("block %s\n", name);
  for (i = 0; i < m ; i++) {
    for (j = 0; j < n ; j++) {
      printf("%9.2f\t", a[n*i+j]);
    }
    printf("\n");
  }
  printf("\n");
  return 0;
}

// A <- B
int dsmat_copy(Matrix * A, Matrix * B) {
  int i, j;
  int me;
  int mb, nb, b;
  Block *Aij, *Bij;

  MPI_Comm_rank(MPI_COMM_WORLD, &me);

  A->mb = B->mb;
  A->nb = B->nb;
  A->b = B->b;

  mb = A->mb;
  nb = A->nb;
  b = A->b;

  A->blocks = calloc(mb, sizeof(Block*));
  for (i = 0; i < mb; i++){
    A->blocks[i] = calloc(nb, sizeof(Block));
    for (j = 0; j < nb; j++){
      Aij = & A->blocks[i][j];
      Bij = & B->blocks[i][j];
      Aij->owner = Bij->owner;
      Aij->row = Bij->row;
      Aij->col = Bij->col;
      Aij->request = MPI_REQUEST_NULL;
      if (Bij->owner == me) {
        Aij->c = calloc(b*b,sizeof(float));
        block_copy(Aij->c, Bij->c, b, b);
      }
    }
  }
  return 0;
}

int dsmat_copy_to(Matrix * A, Matrix * B, int rcv, char* copy, char* copied) {
  int i, j, l;
  int me,tag;
  int mb, nb, b;
  Block *Aij, *Bij;
  float* localA;
  MPI_Status status;

  MPI_Comm_rank(MPI_COMM_WORLD, &me);
  A->nb = 1;
  A->mb = 1;
  A->b = -1;

  mb = B->mb;
  nb = B->nb;
  b = B->b;

  tag = 0;
  A->blocks = malloc(sizeof(Block*));
  A->blocks[0] = malloc(sizeof(Block));
  Aij = & A->blocks[0][0];
  Aij->owner = rcv;
  Aij->row = -1;
  Aij->col = -1; // not on a grid ...
  Aij->request = MPI_REQUEST_NULL;
  if (me == rcv) {
    Aij->c = malloc(mb*b*nb*b *sizeof(float));
  }
  for (i = 0; i < mb; i++){
    for (j = 0; j < nb; j++){
      Bij = & B->blocks[i][j];
      if (Bij->owner == me) {
        if (rcv != me) {
          MPI_Send(Bij->c, b*b, MPI_FLOAT,
                   rcv, tag,
                   MPI_COMM_WORLD);
        } else {
          for (l = 0; l < b; l++) {
            /* copy row l of block (i,j) into the assembled matrix */
            block_copy(&Aij->c[nb*i*b*b+j*b+l*nb*b], &Bij->c[l*b], 1, b);
          }
        }
      } else if (me == rcv) {
        localA = malloc(b*b*sizeof(float));
        MPI_Recv(localA, b*b, MPI_FLOAT,
                 Bij->owner, tag,
                 MPI_COMM_WORLD,&status);
        for (l = 0; l < b; l++) {
          /* copy row l of the received block into the assembled matrix */
          block_copy(&Aij->c[nb*i*b*b+j*b+l*nb*b], &localA[l*b], 1, b);
        }
        free(localA);
      }
    }
  }
  return 0;
}
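dsmat_scal leans on the BLAS contract C <- alpha*A*B + beta*C: passing alpha = 0 (with C also supplied as the third operand) turns cblas_sgemm into an in-place scaling of C by beta, and the A/B entries are not referenced. A standalone sketch of that trick:

#include <stdio.h>
#include <cblas.h>

/* Demo: with alpha == 0, sgemm computes C <- beta*C, i.e. an in-place scaling. */
int main(void) {
  float C[4] = {1.0f, 2.0f, 3.0f, 4.0f};   /* 2 x 2, row-major */
  float beta = 0.5f;
  /* A and B are not referenced when alpha == 0; C is passed for all three. */
  cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, 2, 2, 2,
              0.0f, C, 2, C, 2,
              beta, C, 2);
  for (int i = 0; i < 4; i++) printf("%f\n", C[i]); /* 0.5 1.0 1.5 2.0 */
  return 0;
}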
62
TP2/src/dsmat.h
Normal file
@ -0,0 +1,62 @@
#ifndef DENSE_MAT_FNCT_H
#define DENSE_MAT_FNCT_H

#include <mpi.h> /* for MPI_Request */

typedef struct Blocks {
  float* c;            // The Content of the block stored in an array.
                       // This pointer is only meaningful to the owner,
                       // otherwise it is NULL.
                       // Element x_i,j of a given block of size b
                       // can be accessed as x->c[b*i+j].
  int owner;           // The MPI rank of the owner of this block.
                       // This information is available to all the nodes.
  int row, col;        // owner = row * q + col in a p x q grid.
  MPI_Request request; // The Request can be used when sending the block
                       // through Immediate Return routines of MPI such as MPI_Irecv.
} Block;

typedef struct Matrices {
  int mb, nb, b;  // A given Matrix is of size mb*b x nb*b, b being the
                  // dimension of every one of its square blocks, i.e.
                  // mb is the number of row blocks and nb the number of column blocks.
  Block** blocks; // This 2D array describes each block of a given Matrix.
                  // This is meaningful to all the nodes : information on a block A_i,j
                  // from a matrix A can be accessed through the block A->blocks[i][j] from every MPI rank.
} Matrix;

// tracing
void init_trace();

/* dense matrices routines */
// fill matrix a with values matching the position of the block in the matrix
// i.e. block a_i,j is full of n*(i+1)+(j+1) with a of size m x n
int dsmat_fill_s(Matrix* a, int m, int n, int b, int p, int q, char* name);
// destroy matrix a
int dsmat_destroy(Matrix* a, char* name);
// scale matrix a by alpha
int dsmat_scal_check(Matrix* a, float alpha);
int dsmat_scal(Matrix* a, float alpha);

int dsmat_fill_v(Matrix* a, int m, int n, int b, int p, int q, char* name, float value);

/* dense matrices copy */
// copy b[0:m-1,0:n-1] into a[0:m-1,0:n-1]
int block_copy(float * a, float * b, int m, int n);
// print a[0:m-1,0:n-1]
int block_print(float * a, int m, int n, char* name);
// copy matrix B into matrix A
int dsmat_copy(Matrix * A, Matrix * B);
// copy matrix B into matrix A owned only by rank rcv
int dsmat_copy_to(Matrix * A, Matrix * B, int rcv, char* copy, char* copied);

/* gemm generic routines */
// computing C += A:l * Bl: for all blocks of C I own using compute_local_op
// matrices A and B that I do not own are freed from memory using free_local_op
int local_outer_product_check(float alpha, Matrix* A, Matrix* B, Matrix* C, int l, int p, int q);
int local_outer_product(float alpha, Matrix* A, Matrix* B, Matrix* C, int l, int p, int q);
// compute C_i,j += A_i,l * B_l,j
// if a given block is missing, the corresponding computation is skipped
int compute_local_op(float alpha, Matrix* A, Matrix* B, Matrix* C, int i, int j, int l);
// free A:l and Bl: from memory if I do not own them
int free_local_op(Matrix* A, Matrix* B, int l, int p, int q);

#endif
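The comments above pin down the distribution: block (i,j) lives on rank row*q + col of the p x q grid, and element (x,y) of an owned block sits at c[b*x+y]. A small sketch of reading one local entry under exactly those conventions (illustrative helper, not part of the TP's API):

#include <mpi.h>
#include "dsmat.h"

/* Sketch: fetch element (x,y) of block (i,j) if this rank owns it;
   returns 0.0f otherwise. */
float block_entry(Matrix* A, int i, int j, int x, int y) {
  int me;
  MPI_Comm_rank(MPI_COMM_WORLD, &me);
  Block* Aij = &A->blocks[i][j];
  if (Aij->owner != me || Aij->c == NULL) return 0.0f; /* not stored here  */
  return Aij->c[A->b * x + y];                         /* row-major inside */
}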
88
TP2/src/ex1.c
Normal file
@ -0,0 +1,88 @@
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#include <cblas.h>
#include "utils.h"
#include "dsmat.h"
#include "gemms.h"

void p2p_transmit_A(int p, int q, Matrix *A, int i, int l)
{
    int j;
    int me, my_row, my_col;
    MPI_Status status;

    MPI_Comm_rank(MPI_COMM_WORLD, &me);
    node_coordinates_2i(p, q, me, &my_row, &my_col);

    Block *Ail;
    int node, tag, b;
    tag = 0;
    Ail = &A->blocks[i][l];
    b = A->b;

    /* TODO : transmit A[i,l] using MPI_Ssend & MPI_Recv */
    if (Ail->owner == me)
    { // I own A[i,l]
        /* MPI_Ssend A[i,l] to my row */
        for (j = 0; j < q; j++)
        {
            node = get_node(p, q, my_row, j);
            if (node != me)
            {
                // printf("%d Sending A[%d,%d] to node %d\n", me, i, l, node);
                MPI_Ssend(Ail->c, b * b, MPI_FLOAT, node, tag, MPI_COMM_WORLD);
                // printf("%d Sent A[%d,%d] to node %d\n", me, i, l, node);
            }
        }
    }
    else if (Ail->row == my_row)
    { // A[i,l] is stored on my row
        Ail->c = malloc(b * b * sizeof(float));
        /* MPI_Recv A[i,l] */
        // printf("%d Receiving A[%d,%d] from node %d\n", me, i, l, Ail->owner);
        MPI_Recv(Ail->c, b * b, MPI_FLOAT, Ail->owner, tag, MPI_COMM_WORLD, &status);
        // printf("%d Received A[%d,%d] from node %d\n", me, i, l, Ail->owner);
    }
    /* end TODO */
}

void p2p_transmit_B(int p, int q, Matrix *B, int l, int j)
{
    int i;
    int me, my_row, my_col;
    MPI_Status status;

    MPI_Comm_rank(MPI_COMM_WORLD, &me);
    node_coordinates_2i(p, q, me, &my_row, &my_col);

    int node, tag, b;
    tag = 1;
    Block *Blj;
    Blj = &B->blocks[l][j];
    b = B->b;
    /* TODO : transmit B[l,j] using MPI_Ssend & MPI_Recv */
    if (Blj->owner == me)
    { // I own B[l,j]
        /* MPI_Ssend B[l,j] to my column */
        for (i = 0; i < p; i++)
        {
            node = get_node(p, q, i, my_col);
            if (node != me)
            {
                // printf("%d Sending B[%d,%d] to node %d\n", me, l, j, node);
                MPI_Ssend(Blj->c, b * b, MPI_FLOAT, node, tag, MPI_COMM_WORLD);
                // printf("%d Sent B[%d,%d] to node %d\n", me, l, j, node);
            }
        }
    }
    else if (Blj->col == my_col)
    { // B[l,j] is stored on my column
        Blj->c = malloc(b * b * sizeof(float));
        /* MPI_Recv B[l,j] */
        // printf("%d Receiving B[%d,%d] from node %d\n", me, l, j, Blj->owner);
        MPI_Recv(Blj->c, b * b, MPI_FLOAT, Blj->owner, tag, MPI_COMM_WORLD, &status);
        // printf("%d Received B[%d,%d] from node %d\n", me, l, j, Blj->owner);
    }
    /* end TODO */
}
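One point worth keeping in mind with ex1.c: MPI_Ssend is a synchronous send, so it completes only once the matching receive has been posted, and the owner serializes its sends over the row or column. A standalone sketch of one Ssend/Recv pairing (my example, assuming a 2-process run; not part of the TP code):

// Minimal sketch: synchronous send blocks until the receiver posts its receive.
#include <stdio.h>
#include <mpi.h>

int main(int argc, char *argv[])
{
    MPI_Init(&argc, &argv);
    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    float buf = 0.0f;
    if (rank == 0)
    {
        buf = 3.14f;
        MPI_Ssend(&buf, 1, MPI_FLOAT, 1, 0, MPI_COMM_WORLD); // returns once rank 1 is receiving
    }
    else if (rank == 1)
    {
        MPI_Recv(&buf, 1, MPI_FLOAT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        printf("got %f\n", buf);
    }
    MPI_Finalize();
    return 0;
}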
63
TP2/src/ex1.c.clem
Normal file
@ -0,0 +1,63 @@
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#include <cblas.h>
#include "utils.h"
#include "dsmat.h"
#include "gemms.h"

void p2p_transmit_A(int p, int q, Matrix *A, int i, int l) {
  int j;
  int me, my_row, my_col;
  MPI_Status status;

  MPI_Comm_rank(MPI_COMM_WORLD, &me);
  node_coordinates_2i(p,q,me,&my_row,&my_col);

  Block *Ail;
  int node, b;
  Ail = &A->blocks[i][l];
  b = A->b;
  /* TODO : transmit A[i,l] using MPI_Ssend & MPI_Recv */
  if (Ail->owner == me /* I own A[i,l] */) {
    /* MPI_Ssend A[i,l] to my row */
    for (j = 0; j < q; j++) {
      node = get_node(p, q, my_row, j);
      if (node != me)
        MPI_Ssend(Ail->c, b*b, MPI_FLOAT, node, 0, MPI_COMM_WORLD);
    }
  } else if (Ail->row == my_row /* A[i,l] is stored on my row */) {
    Ail->c = malloc(b*b*sizeof(float));
    /* MPI_Recv A[i,l] */
    MPI_Recv(Ail->c, b*b, MPI_FLOAT, Ail->owner, 0, MPI_COMM_WORLD, &status);
  }
  /* end TODO */
}

void p2p_transmit_B(int p, int q, Matrix *B, int l, int j) {
  int i;
  int me, my_row, my_col;
  MPI_Status status;

  MPI_Comm_rank(MPI_COMM_WORLD, &me);
  node_coordinates_2i(p,q,me,&my_row,&my_col);

  int node, b;
  Block *Blj;
  Blj = &B->blocks[l][j];
  b = B->b;
  /* TODO : transmit B[l,j] using MPI_Ssend & MPI_Recv */
  if (Blj->owner == me /* I own B[l,j] */) {
    /* MPI_Ssend B[l,j] to my column */
    for (i = 0; i < p; i++) {
      node = get_node(p, q, i, my_col);
      if (node != me)
        MPI_Ssend(Blj->c, b*b, MPI_FLOAT, node, 1, MPI_COMM_WORLD);
    }
  } else if (Blj->col == my_col /* B[l,j] is stored on my column */) {
    Blj->c = malloc(b*b*sizeof(float));
    /* MPI_Recv B[l,j] */
    MPI_Recv(Blj->c, b*b, MPI_FLOAT, Blj->owner, 1, MPI_COMM_WORLD, &status);
  }
  /* end TODO */
}
5
TP2/src/ex1.h
Normal file
@ -0,0 +1,5 @@
#ifndef EXO_1_H
#define EXO_1_H
void p2p_transmit_A(int p, int q, Matrix *A, int i, int l);
void p2p_transmit_B(int p, int q, Matrix *B, int l, int j);
#endif
53
TP2/src/ex2.c
Normal file
@ -0,0 +1,53 @@
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#include <cblas.h>
#include "utils.h"
#include "dsmat.h"
#include "gemms.h"

void bcast_A(int p, int q, Matrix *A, int i, int l, MPI_Comm row_comm)
{
    int me, my_row, my_col;

    MPI_Comm_rank(MPI_COMM_WORLD, &me);
    node_coordinates_2i(p, q, me, &my_row, &my_col);

    Block *Ail;
    int b = A->b;
    Ail = &A->blocks[i][l];
    /* TODO : transmit A[i,l] using MPI_Bcast */
    if (q > 1 && Ail->row == my_row)
    { /* Ail is stored on my row */
        if (Ail->owner != me)
        {
            Ail->c = calloc(b * b, sizeof(float));
        }
        // MPI_Bcast
        MPI_Bcast(Ail->c, b * b, MPI_FLOAT, Ail->col, row_comm);
    }
    /* end TODO */
}

void bcast_B(int p, int q, Matrix *B, int l, int j, MPI_Comm col_comm)
{
    int me, my_row, my_col;

    MPI_Comm_rank(MPI_COMM_WORLD, &me);
    node_coordinates_2i(p, q, me, &my_row, &my_col);

    Block *Blj;
    int b = B->b;
    Blj = &B->blocks[l][j];
    /* TODO : transmit B[l,j] using MPI_Bcast */
    if (p > 1 && Blj->col == my_col)
    { /* Blj is stored on my column */
        if (Blj->owner != me)
        {
            Blj->c = calloc(b * b, sizeof(float));
        }
        // MPI_Bcast
        MPI_Bcast(Blj->c, b * b, MPI_FLOAT, Blj->row, col_comm);
    }
    /* end TODO */
}
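Why Ail->col is a valid broadcast root here: the row communicator is built in gemms.c with MPI_Comm_split(MPI_COMM_WORLD, my_row, me, &row_comm), and since the key is the global rank me = q*my_row + my_col, the processes of one row are ordered by column, so each one's rank within row_comm equals its column index. A sketch checking that correspondence (my illustration; p and q are assumed to be in scope):

// Sketch: after the split, row_comm rank should equal the column index.
int me, my_row, my_col, me_in_row;
MPI_Comm row_comm;
MPI_Comm_rank(MPI_COMM_WORLD, &me);
node_coordinates_2i(p, q, me, &my_row, &my_col);
MPI_Comm_split(MPI_COMM_WORLD, my_row, me, &row_comm);
MPI_Comm_rank(row_comm, &me_in_row);
// expected: me_in_row == my_col, hence root = Ail->col in bcast_A
MPI_Comm_free(&row_comm);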
5
TP2/src/ex2.h
Normal file
@ -0,0 +1,5 @@
#ifndef EXO_2_H
#define EXO_2_H
void bcast_A(int p, int q, Matrix *A, int i, int l, MPI_Comm row_comm);
void bcast_B(int p, int q, Matrix *B, int l, int j, MPI_Comm col_comm);
#endif
108
TP2/src/ex3.c
Normal file
@ -0,0 +1,108 @@
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#include <cblas.h>
#include "utils.h"
#include "dsmat.h"
#include "gemms.h"

void p2p_i_transmit_A(int p, int q, Matrix *A, int i, int l)
{
    int j, b;
    int me, my_row, my_col;
    MPI_Comm_rank(MPI_COMM_WORLD, &me);
    node_coordinates_2i(p, q, me, &my_row, &my_col);

    int node, tag;
    tag = 0;
    Block *Ail;
    Ail = &A->blocks[i][l];
    b = A->b;

    /* TODO : transmit A[i,l] using MPI_Isend/recv */
    if (Ail->owner == me)
    {
        // MPI_Isend Ail to my row
        for (j = 0; j < q; j++)
        {
            node = get_node(p, q, my_row, j);
            if (node != me)
            {
                MPI_Isend(Ail->c, b * b, MPI_FLOAT, node, tag, MPI_COMM_WORLD, &Ail->request);
            }
        }
    }
    else if (Ail->row == my_row)
    {
        Ail->c = calloc(b * b, sizeof(float));
        // MPI_Irecv Ail
        MPI_Irecv(Ail->c, b * b, MPI_FLOAT, Ail->owner, tag, MPI_COMM_WORLD, &Ail->request);
    }
    /* end TODO */
}

void p2p_i_transmit_B(int p, int q, Matrix *B, int l, int j)
{
    int i, b;
    int me, my_row, my_col;
    MPI_Comm_rank(MPI_COMM_WORLD, &me);
    node_coordinates_2i(p, q, me, &my_row, &my_col);

    int node, tag;
    tag = 1;
    Block *Blj;
    Blj = &B->blocks[l][j];
    b = B->b;

    /* TODO : transmit B[l,j] using MPI_Isend/recv */
    if (Blj->owner == me)
    {
        // MPI_Isend Blj to my col
        for (i = 0; i < p; i++)
        {
            node = get_node(p, q, i, my_col);
            if (node != me)
            {
                MPI_Isend(Blj->c, b * b, MPI_FLOAT, node, tag, MPI_COMM_WORLD, &Blj->request);
            }
        }
    }
    else if (Blj->col == my_col)
    {
        Blj->c = calloc(b * b, sizeof(float));
        // MPI_Irecv Blj
        MPI_Irecv(Blj->c, b * b, MPI_FLOAT, Blj->owner, tag, MPI_COMM_WORLD, &Blj->request);
    }
    /* end TODO */
}

void p2p_i_wait_AB(int p, int q, Matrix *A, Matrix *B, Matrix *C, int l)
{
    int me, my_row, my_col;
    MPI_Comm_rank(MPI_COMM_WORLD, &me);
    node_coordinates_2i(p, q, me, &my_row, &my_col);

    int i, j;
    Block *Ail, *Blj;
    /* TODO : wait for A[i,l] and B[l,j] if I need them */
    for (i = 0; i < A->mb; i++)
    {
        Ail = &A->blocks[i][l];
        if (Ail->owner != me && Ail->row == my_row)
        {
            // MPI_Wait Ail
            MPI_Wait(&Ail->request, MPI_STATUS_IGNORE);
        }
    }
    for (j = 0; j < B->nb; j++)
    {
        Blj = &B->blocks[l][j];
        if (Blj->owner != me && Blj->col == my_col)
        {
            // MPI_Wait Blj
            MPI_Wait(&Blj->request, MPI_STATUS_IGNORE);
        }
    }
    /* Alternative suggestion : iterate over blocks of C */
    /* end TODO */
}
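The "alternative suggestion" in p2p_i_wait_AB could look like the following sketch (my reading of the comment, not repo code): wait only for the operands of blocks of C that I own. Waiting twice on the same request is safe, because MPI_Wait sets a completed request to MPI_REQUEST_NULL and waiting on a null request returns immediately.

// Sketch of the alternative body: wait per owned block of C instead of per row/column.
for (i = 0; i < C->mb; i++)
{
    for (j = 0; j < C->nb; j++)
    {
        if (C->blocks[i][j].owner != me) continue; // only my C blocks need operands
        Ail = &A->blocks[i][l];
        Blj = &B->blocks[l][j];
        if (Ail->owner != me) MPI_Wait(&Ail->request, MPI_STATUS_IGNORE); // no-op if already done
        if (Blj->owner != me) MPI_Wait(&Blj->request, MPI_STATUS_IGNORE);
    }
}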
6
TP2/src/ex3.h
Normal file
@ -0,0 +1,6 @@
#ifndef EXO_3_H
#define EXO_3_H
void p2p_i_transmit_A(int p, int q, Matrix *A, int i, int l);
void p2p_i_transmit_B(int p, int q, Matrix *B, int l, int j);
void p2p_i_wait_AB(int p, int q, Matrix *A, Matrix *B, Matrix *C, int l);
#endif
157
TP2/src/gemms.c
Normal file
@ -0,0 +1,157 @@
#include <stdlib.h>
#include <stdio.h>
#include <mpi.h>
#include <cblas.h>
#include "utils.h"
#include "dsmat.h"
#include "gemms.h"

#include "ex1.h"
#include "ex2.h"
#include "ex3.h"

int pgemm_p2p(int check, int p, int q, int m, int n, int k, Matrix* A, Matrix* B, Matrix* C) {
  int mb, nb, kb;
  int i, j, l;
  int me, me_coord[2], my_row, my_col;
  MPI_Comm_rank(MPI_COMM_WORLD, &me);
  node_coordinates(p,q,me,me_coord);
  node_coordinates_2i(p,q,me,&my_row,&my_col);

  if (A->nb != B->mb || A->mb != C->mb || B->nb != C->nb) {
    if (me == 0) {
      printf(" A B C\n");
      printf(" mb %d %d %d\n", A->mb, B->mb, C->mb);
      printf(" nb %d %d %d\n", A->nb, B->nb, C->nb);
    }
    return 1;
  }
  if (B->b != A->b || A->b != C->b) return 2;
  mb = C->mb;
  nb = C->nb;
  kb = A->nb;

  for (l = 0; l < kb; l++) {
    for (i = 0; i < mb; i++) {
      p2p_transmit_A(p,q,A,i,l);
    }
    for (j = 0; j < nb; j++) {
      p2p_transmit_B(p,q,B,l,j);
    }
    if (check) {
      local_outer_product_check(1.0f, A, B, C, l, p, q);
    } else {
      local_outer_product(1.0f, A, B, C, l, p, q);
    }
  }
  // printf("FINI\n");
  return 0;
}

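For reference, the computation pgemm_p2p distributes is the blocked outer-product form of GEMM: C accumulates, for each shared block index l, the product of block column A[:,l] with block row B[l,:]. Serially and densely the same computation is just (my sketch, not in the repo):

// Reference (serial, dense, row-major): C += A*B as a sum of k rank-1 updates,
// the decomposition that pgemm_p2p distributes block-wise over the grid.
void gemm_outer_ref(int m, int n, int k, const float *A, const float *B, float *C) {
  for (int l = 0; l < k; l++)        // one outer product per shared index
    for (int i = 0; i < m; i++)
      for (int j = 0; j < n; j++)
        C[i*n + j] += A[i*k + l] * B[l*n + j];
}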
int pgemm_bcast(int check, int p, int q, int m, int n, int k, Matrix* A, Matrix* B, Matrix* C) {
  int mb, nb, kb;
  int i, j, l;
  int me, me_row_comm, me_col_comm, me_coord[2];
  int my_row, my_col;
  MPI_Comm row_comm, col_comm;

  MPI_Comm_rank(MPI_COMM_WORLD, &me);

  if (A->nb != B->mb || A->mb != C->mb || B->nb != C->nb) {
    if (me == 0) {
      printf(" A B C\n");
      printf(" mb %d %d %d\n", A->mb, B->mb, C->mb);
      printf(" nb %d %d %d\n", A->nb, B->nb, C->nb);
    }
    return 1;
  }
  if (B->b != A->b || A->b != C->b) return 2;
  mb = C->mb;
  nb = C->nb;
  kb = A->nb;

  node_coordinates(p,q,me,me_coord);
  node_coordinates_2i(p,q,me,&my_row, &my_col);
  if (q > 1) {
    MPI_Comm_split(MPI_COMM_WORLD, my_row, me, &row_comm);
    MPI_Comm_rank(row_comm, &me_row_comm);
  } else {
    me_row_comm = -1;
  }
  if (p > 1) {
    MPI_Comm_split(MPI_COMM_WORLD, my_col, me, &col_comm);
    MPI_Comm_rank(col_comm, &me_col_comm);
  } else {
    me_col_comm = -1;
  }

  for (l = 0; l < kb ; l++) {
    for (i = 0; i < mb; i++) {
      bcast_A(p,q,A,i,l,row_comm);
    }
    for (j = 0; j < nb; j++) {
      bcast_B(p,q,B,l,j,col_comm);
    }
    if (check) {
      local_outer_product_check(1.0f, A, B, C, l, p, q);
    } else {
      local_outer_product(1.0f, A, B, C, l, p, q);
    }
  }
  if (q > 1)
    MPI_Comm_free(&row_comm);
  if (p > 1)
    MPI_Comm_free(&col_comm);
  return 0;
}

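A worked layout of the communicators pgemm_bcast builds, for p = 2, q = 3 (my illustration):

// Global rank = q*row + col:
//          col 0  col 1  col 2
// row 0:     0      1      2      -> row_comm {0,1,2}
// row 1:     3      4      5      -> row_comm {3,4,5}
// col_comms: {0,3}, {1,4}, {2,5}
// Within each row_comm/col_comm, a process's rank equals its column/row index.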
int pgemm_p2p_i_la(int check, int p, int q, int lookahead, int m, int n, int k, Matrix* A, Matrix* B, Matrix* C) {
  int mb, nb, kb;
  int i, j, l;
  int me, me_coord[2], my_row, my_col;
  MPI_Comm_rank(MPI_COMM_WORLD, &me);
  node_coordinates(p,q,me,me_coord);
  node_coordinates_2i(p,q,me,&my_row,&my_col);

  if (A->nb != B->mb || A->mb != C->mb || B->nb != C->nb) {
    if (me == 0) {
      printf(" A B C\n");
      printf(" mb %d %d %d\n", A->mb, B->mb, C->mb);
      printf(" nb %d %d %d\n", A->nb, B->nb, C->nb);
    }
    return 1;
  }
  if (B->b != A->b || A->b != C->b) return 2;
  mb = C->mb;
  nb = C->nb;
  kb = A->nb;
  if (lookahead <= 0) return 3;
  if (lookahead >= kb) lookahead = kb;
  //printf("LA = %d, KB = %d\n",lookahead, kb);
  for (l = 0; l < lookahead ; l++) {
    for (i = 0; i < mb; i++) {
      p2p_i_transmit_A(p,q,A,i,l);
    }
    for (j = 0; j < nb; j++) {
      p2p_i_transmit_B(p,q,B,l,j);
    }
  }
  for (l = 0; l < kb ; l++) {
    if (l < kb - lookahead) { // prepost the communications for step l + lookahead
      for (i = 0; i < mb; i++) {
        p2p_i_transmit_A(p,q,A,i,l+lookahead);
      }
      for (j = 0; j < nb; j++) {
        p2p_i_transmit_B(p,q,B,l+lookahead,j);
      }
    }
    p2p_i_wait_AB(p,q,A,B,C,l);
    if (check) {
      local_outer_product_check(1.0f, A, B, C, l, p, q);
    } else {
      local_outer_product(1.0f, A, B, C, l, p, q);
    }
  }
  return 0;
}

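The lookahead schedule of pgemm_p2p_i_la, traced for kb = 4 and lookahead = 2 (my illustration):

// prologue: post communications for l = 0 and l = 1
// l = 0: post comms for l = 2, wait for l = 0, compute l = 0
// l = 1: post comms for l = 3, wait for l = 1, compute l = 1
// l = 2: no more posts (2 >= kb - lookahead), wait for l = 2, compute l = 2
// l = 3: wait for l = 3, compute l = 3
// Communications for step l+lookahead thus overlap the computation of step l.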
9
TP2/src/gemms.h
Normal file
@ -0,0 +1,9 @@
#ifndef PROGPARALLEL_GEMMS_H
#define PROGPARALLEL_GEMMS_H

int pgemm_p2p(int check, int p, int q, int m, int n, int k, Matrix* A, Matrix* B, Matrix* C);
int pgemm_bcast(int check, int p, int q, int m, int n, int k, Matrix* A, Matrix* B, Matrix* C);
//int pgemm_p2p_i(int p, int q, int m, int n, int k, Matrix* A, Matrix* B, Matrix* C);
int pgemm_p2p_i_la(int check, int p, int q, int lookahead, int m, int n, int k, Matrix* A, Matrix* B, Matrix* C);

#endif
270
TP2/src/main.c
Normal file
@ -0,0 +1,270 @@
#include <mpi.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <float.h>
#include <cblas.h>
#include <time.h>
#include <argp.h>

#include "utils.h"
#include "dsmat.h"
#include "gemms.h"

static char doc[] =
  "TP Prog Parallèle -- Command line";

static char args_doc[] = "-m [m] -n [n] -k [k] -b [b] -p [p] -q [q] --algorithm [p2p|p2p-i-la|bcast] --lookahead [la] --niter [i]";

static struct argp_option options[] = {
  {"m",         'm', "int",    0, "Number of rows in A and C (deprecated)" },
  {"n",         'n', "int",    0, "Dimension of A, B and C" },
  {"k",         'k', "int",    0, "Shared dimension of A and B (deprecated)" },
  {"blocking",  'b', "int",    0, "Size of the square blocks of A, B and C (must divide m, n and k)" },
  {"p",         'p', "int",    0, "Length of the logical grid"},
  {"q",         'q', "int",    0, "Width of the logical grid"},
  {"algorithm", 'a', "string", 0, "GEMM distributed algorithm to use"},
  {"lookahead", 'l', "int",    0, "Parameter for the p2p-i-la algorithm"},
  {"verbose",   'v', 0,        0, "Whether the program prints more information"},
  {"checking",  'c', 0,        0, "Whether the program checks GEMM results"},
  {"niter",     'i', "int",    0, "Number of iterations"},
  { 0 }
};

struct arguments
{
  int m, n, k, b;
  int p, q;
  int la;
  char* algo;
  int verbose, check;
  int iter;
};

static error_t
parse_opt (int key, char *arg, struct argp_state *state)
{
  /* Get the input argument from argp_parse, which we
     know is a pointer to our arguments structure. */
  struct arguments *arguments = state->input;

  switch (key)
    {
    case 'm':
      arguments->m = atoi(arg);
      break;
    case 'n':
      arguments->n = atoi(arg);
      break;
    case 'k':
      arguments->k = atoi(arg);
      break;
    case 'b':
      arguments->b = atoi(arg);
      break;
    case 'p':
      arguments->p = atoi(arg);
      break;
    case 'q':
      arguments->q = atoi(arg);
      break;
    case 'l':
      arguments->la = atoi(arg);
      break;
    case 'a':
      arguments->algo = arg;
      break;
    case 'v':
      arguments->verbose = 1;
      break;
    case 'c':
      arguments->check = 1;
      break;
    case 'i':
      arguments->iter = atoi(arg);
      break;
    default:
      return ARGP_ERR_UNKNOWN;
    }
  return 0;
}

static struct argp argp = { options, parse_opt, args_doc, doc };

// void print_res(Matrix C, char* algo) {
//   int i,j;
//   int size, rank;
//   MPI_Comm_size(MPI_COMM_WORLD, &size);
//   MPI_Comm_rank(MPI_COMM_WORLD, &rank);
//   char name[100];
//   for (i=0;i<C.mb;i++) {
//     for (j=0;j<C.nb;j++) {
//       sprintf(name,"resC[%d,%d](%s)",i,j,algo);
//       if (C.blocks[i][j].owner == rank)
//         block_print(C.blocks[i][j].c, C.b, C.b, name);
//     }
//   }
// }

void gflops_gemm(int m, int n, int k, float exec_time, double* gflops) {
  (*gflops) = 2.0*m*n*k/(exec_time*pow(10,9));
}

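A quick sanity check of gflops_gemm's formula (worked example):

// m = n = k = 1000, exec_time = 0.5 s:
// gflops = 2.0 * 1000^3 / (0.5 * 1e9) = 2e9 / 5e8 = 4.0 Gflop/s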
int main(int argc, char* argv[]) {
  struct arguments arguments;
  arguments.m = 20;
  arguments.n = 20;
  arguments.k = 20;
  arguments.b = 10;
  arguments.p = 2;
  arguments.q = 2;
  arguments.algo = "p2p";
  arguments.la = 0;
  arguments.verbose = 0;
  arguments.check = 0;
  arguments.iter = 1;

  int p, q;
  int m,n,k,b;
  int la;
  int err, iter, niter;
  double d_start, d_stop; // on multiple nodes
  clock_t t; // on one node
  double time_taken, gflops;
  char hostname[1024];
  char * algo;
  int vbose, check;

  argp_parse (&argp, argc, argv, 0, 0, &arguments);
  m = arguments.m;
  n = arguments.n;
  k = arguments.k;
  b = arguments.b;
  p = arguments.p;
  q = arguments.q;
  algo = arguments.algo;
  la = arguments.la;
  vbose = arguments.verbose;
  check = arguments.check;
  niter = arguments.iter;
  if (strcmp(algo,"p2p")*strcmp(algo,"p2p-i-la")*strcmp(algo,"bcast") != 0) {
    printf("Wrong value for algo, only p2p, p2p-i-la and bcast are allowed\n");
    return 1;
  }
  if (b <= 0) { printf("Wrong value for B, should be positive\n"); return 1; }
  if (m%b != 0) { printf("M should be divisible by B\n"); return 1; }
  if (n%b != 0) { printf("N should be divisible by B\n"); return 1; }
  if (k%b != 0) { printf("K should be divisible by B\n"); return 1; }
  if (niter < 0) { printf("Wrong value for niter, should be non-negative\n"); return 1; }

  get_host_name(hostname,1024);
  init_trace();
  // openblas_set_num_threads(1);
  srand(time(NULL));

  MPI_Init(NULL,NULL);
  int size, rank;
  MPI_Comm_size(MPI_COMM_WORLD, &size);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  if (vbose)
    //printf("I am the %d-th node in a world of size %d\n", rank, size);
    printf("%s is the %d-th node in a world of size %d\n", hostname, rank, size);
  if (p*q != size) {
    printf("bad world size (p*q != size)\n");
    return 1;
  }

  // this initialization could probably be improved
  Matrix A = (Matrix){0}, B = (Matrix){0}, C = (Matrix){0};
  Matrix wA = (Matrix){0}, wB = (Matrix){0}, wC = (Matrix){0}, bwC = (Matrix){0};
  Matrix bA = (Matrix){0}, bB = (Matrix){0}, bC = (Matrix){0};
  if (vbose)
    printf("[%s] m,n,k = %d,%d,%d | b = %d | pxq = %dx%d | la = %d \n",
           hostname, m,n,k, b, p,q, la);
  // printf("[%d] m,n,k = %d,%d,%d | b = %d | pxq = %dx%d | la = %d \n",
  //        rank, m,n,k, b, p,q, la);
  //err = dsmat_fill_v(&A, m, k, b, p, q, "A", 1.0f);
  //err = dsmat_fill_v(&B, k, n, b, p, q, "B", 1.0f);
  err = dsmat_fill_s(&A, m, k, b, p, q, "A");
  err = dsmat_fill_s(&B, k, n, b, p, q, "B");
  //err = dsmat_fill(&A, m, k, b, p, q, "A");
  //err = dsmat_fill(&B, k, n, b, p, q, "B");
  err = dsmat_fill_v(&C, m, n, b, p, q, "C", 0.0f);
  err = MPI_Barrier(MPI_COMM_WORLD);
  if (err != MPI_SUCCESS) return 1;

  for (iter = 0; iter < niter; iter++) {
    err = dsmat_copy(&wA,&A);
    err = dsmat_copy(&wB,&B);
    err = dsmat_copy(&wC,&C);
    MPI_Barrier(MPI_COMM_WORLD);
    d_start = MPI_Wtime();
    if (strcmp(algo,"p2p") == 0) {
      err = pgemm_p2p(check,p,q,m,n,k,&wA,&wB,&wC);
    // } else if (strcmp(algo,"p2p-i") == 0) {
    //   err = pgemm_p2p_i(p,q,m,n,k,&wA,&wB,&wC);
    } else if (strcmp(algo,"p2p-i-la") == 0) {
      err = pgemm_p2p_i_la(check,p,q,la,m,n,k,&wA,&wB,&wC);
    } else if (strcmp(algo,"bcast") == 0) {
      err = pgemm_bcast(check,p,q,m,n,k,&wA,&wB,&wC);
    }
    MPI_Barrier(MPI_COMM_WORLD);
    d_stop = MPI_Wtime();
    gflops_gemm(m,n,k, d_stop - d_start, &gflops);
    if (rank == 0) {
      //printf("[%d] (%s) measured_wtime = %fs (la=%d) | %f Gflop/s\n", rank, algo, d_stop - d_start, la, gflops);
      printf("[%s] (%s) measured_wtime = %fs (la=%d) | %f Gflop/s\n", hostname, algo, d_stop - d_start, la, gflops);
    }
    MPI_Barrier(MPI_COMM_WORLD);

    if (check) {
      err = dsmat_copy_to(&bwC,&wC,0,"bwC","wC");
      err = dsmat_copy_to( &bA, &A,0, "bA", "A");
      err = dsmat_copy_to( &bB, &B,0, "bB", "B");
      err = dsmat_copy_to( &bC, &C,0, "bC", "C");
      MPI_Barrier(MPI_COMM_WORLD);
      if (rank == 0) {
        if (vbose) {
          block_print(bwC.blocks[0][0].c, m, n, algo);
          block_print( bA.blocks[0][0].c, m, k, "gA");
          block_print( bB.blocks[0][0].c, k, n, "gB");
          block_print( bC.blocks[0][0].c, m, n, "gC");
        }
        t = clock();
        cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m,n,k,
                    1.0f, bA.blocks[0][0].c, k, bB.blocks[0][0].c, n,
                    0.0f, bC.blocks[0][0].c, n);
        t = clock() - t;
        time_taken = ((double)t/CLOCKS_PER_SEC);
        gflops_gemm(m,n,k, time_taken, &gflops);
        //printf("[%d] (g) measured_wtime = %fs | %f Gflop/s\n", rank, time_taken, gflops);
        printf("[%s] (g) measured_wtime = %fs | %f Gflop/s\n", hostname, time_taken, gflops);
        if (vbose)
          block_print(bC.blocks[0][0].c, m, n, "gresC");
        myblas_sgepxy(-1.0,bC.blocks[0][0].c,bwC.blocks[0][0].c, m,n);
        float nrm = cblas_snrm2(m*n,bwC.blocks[0][0].c,1);
        if (nrm < DBL_EPSILON) printf("GEMM is correct (%12.5e)\n",nrm);
        else printf("algorithm is not GEMM by %12.5e\n", nrm);
      }
      err = MPI_Barrier(MPI_COMM_WORLD);
      err = dsmat_destroy(&bwC,"bwC");
      err = dsmat_destroy( &bA,"bA");
      err = dsmat_destroy( &bB,"bB");
      err = dsmat_destroy( &bC,"bC");
    }
    MPI_Barrier(MPI_COMM_WORLD);
    err = dsmat_destroy(&wA,"wA");
    err = dsmat_destroy(&wB,"wB");
    err = dsmat_destroy(&wC,"wC");
  }
  err = MPI_Barrier(MPI_COMM_WORLD);
  err = dsmat_destroy(&A,"A");
  err = dsmat_destroy(&B,"B");
  err = dsmat_destroy(&C,"C");
  if (vbose)
    printf("[%s] matrices destroyed (%d) \n", hostname, err);
    //printf("[%d] matrices destroyed (%d) \n", rank, err);
  return MPI_Finalize();
}
74
TP2/src/test.c
Normal file
@ -0,0 +1,74 @@
#include <mpi.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <float.h>
#include <cblas.h>
#include <time.h>
#include "utils.h"
#include "dsmat.h"
#include "gemms.h"

int main(int argc, char* argv[]) {

  int p, q;
  int m,n,k,b;
  int i,j,l,la;
  int err, iter, niter;
  double d_start, d_stop; // on multiple nodes
  clock_t t; // on one node
  double time_taken, gflops;
  int node,tag;
  long unsigned int total_us;
  char name[100];
  char * algo;
  int vbose, check;
  MPI_Status status;

  m = 2;
  n = 4;
  k = 4;
  b = 2;
  p = 1;
  q = 2;
  la = 0; // initialized so the printf below does not read an indeterminate value

  // openblas_set_num_threads(1);
  srand(time(NULL));

  MPI_Init(NULL,NULL);
  int world_size, world_rank;
  MPI_Comm_size(MPI_COMM_WORLD, &world_size);
  MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
  printf("I am the %d-th node in a world of size %d\n", world_rank, world_size);
  if (p*q != world_size) {
    printf("bad world size\n");
    return 1;
  }

  err = MPI_Barrier(MPI_COMM_WORLD);
  if (err != MPI_SUCCESS) return 1;
  // this initialization could probably be improved
  Matrix A = (Matrix){0}, B = (Matrix){0}, C = (Matrix){0};
  Matrix bA = (Matrix){0}, bB = (Matrix){0}, bC = (Matrix){0};
  Matrix wA = (Matrix){0}, wB = (Matrix){0}, wC = (Matrix){0}, bwC = (Matrix){0};
  printf("[%d] m,n,k = %d,%d,%d | b = %d | pxq = %dx%d | la = %d | test %f \n",
         world_rank, m,n,k, b, p,q, la, 1.0f);
  err = dsmat_fill_s(&A, m, k, b, p, q, "A");
  err = MPI_Barrier(MPI_COMM_WORLD);
  if (err != MPI_SUCCESS) return 1;

  err = dsmat_copy(&wA,&A);
  MPI_Barrier(MPI_COMM_WORLD);
  err = dsmat_copy_to(&wC,&A,0,"wC","A");
  printf("[%d] dsmat_copy_to.err = %d\n", world_rank, err);
  err = dsmat_destroy(&wA,"wA");
  err = dsmat_copy(&wA,&A);
  err = dsmat_destroy(&wA,"wA");
  err = dsmat_copy(&wA,&A);
  err = dsmat_destroy(&wC,"wC");
  err = dsmat_destroy(&A,"A");
  err = MPI_Barrier(MPI_COMM_WORLD);
  printf("[%d] matrices destroyed (%d) \n", world_rank, err);
  return MPI_Finalize();
}
85
TP2/src/utils.c
Normal file
@ -0,0 +1,85 @@
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <cblas.h>
#include <unistd.h> // for gethostname
//#include <time.h>
#include <sys/time.h>
#include "utils.h"

void val_mat(int m, int n, float* mat, float val) {
  int i,j;
  for(i = 0; i<m; i++) {
    for(j = 0; j<n; j++) {
      mat[i*n+j] = val;
    }
  }
}

void rand_mat(int m, int n, float* mat, float max) {
  int i,j;
  for(i = 0; i<m; i++)
    for(j = 0; j<n; j++)
      mat[i*n+j] = ((float)rand()/RAND_MAX) * max;
}

// unused, more as a reminder
int item_2d(int i, int j, int m, int n) {
  return i*n + j;
}

long unsigned int time_interval(struct timeval start, struct timeval stop) {
  return labs( (stop.tv_sec - start.tv_sec) * 1000000 + stop.tv_usec - start.tv_usec );
}

void print_gflops(struct timeval stop, struct timeval start, int m, int n, int k, int b, int kernel, char* name, float * c) {
  long unsigned int total_us = time_interval(start, stop);
  printf("flops%f\n",2.0*m*n*k);
  printf("total_us*1e3 = %f\n",total_us*pow(10,3));
  float gflops = fabs( 2.0*m*n*k/(total_us*pow(10,3)) );
  printf("gflops = %f\n",gflops);
  float nrm;
  if (c == NULL) { nrm = -1.0; } else { nrm = cblas_snrm2(m*n, c, 1); }
  printf("%s took %lu µs => %f Gflop/s check: %f (block:%d, kernel:%d)\n", name, total_us, gflops, nrm, b, kernel);
  printf("CSV %d,%d,%d,%d,%d,%s,%ld,%f\n", m,n,k,b,kernel,name,total_us,gflops);
}

void print_mat(float* a, int m, int n, char* name) {
  int i,j;
  for (i = 0; i < m ; i++) {
    for (j = 0; j < n ; j++) {
      printf("%s[%d,%d] = %f,",name,i,j,a[n*i+j]);
    }
    printf("\n");
  }
  printf("\n");
}

// b = alpha*a + b
void myblas_sgepxy(float alpha, float* a, float* b, int m, int n) {
  int i;
  for (i = 0; i < m ; i++) {
    cblas_saxpy(n,alpha,&a[n*i],1,&b[n*i],1);
  }
}

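main.c calls myblas_sgepxy with alpha = -1.0 to form the residual between the reference and the distributed result before taking a norm; the idiom in isolation (my sketch; c_ref and c_computed are hypothetical m x n row-major buffers):

// Residual check idiom: c_ref <- (-1)*c_computed + c_ref, then measure ||c_ref||.
myblas_sgepxy(-1.0f, c_computed, c_ref, m, n);  // c_ref now holds the difference
float nrm = cblas_snrm2(m * n, c_ref, 1);       // a tiny nrm means the results agree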
void node_coordinates(int p, int q, int node, int* coordinates) {
  // node = q * c[0] + c[1]
  coordinates[1] = node % q;
  coordinates[0] = (node - coordinates[1])/q;
}

void node_coordinates_2i(int p, int q, int node, int* my_row, int* my_col) {
  // node = q * my_row + my_col
  *my_col = node % q;
  *my_row = (node - *my_col)/q;
}

int get_node(int p, int q, int i, int j) {
  return q*(i%p) + (j%q);
}

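Worked values for the helpers above on a 2 x 3 grid (my illustration):

// p = 2, q = 3: node 4 -> my_col = 4 % 3 = 1, my_row = (4 - 1)/3 = 1
// get_node wraps block indices onto the grid:
//   get_node(2, 3, 5, 7) = 3*(5%2) + (7%3) = 3*1 + 1 = 4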
// cf stackoverflow (https://stackoverflow.com/questions/504810/how-do-i-find-the-current-machines-full-hostname-in-c-hostname-and-domain-info)
void get_host_name(char* hostname, int buffer_size) {
  hostname[buffer_size - 1] = '\0';
  gethostname(hostname, buffer_size - 1);
}
34
TP2/src/utils.h
Normal file
@ -0,0 +1,34 @@
#ifndef PROGPARALLEL_UTILS_H
#define PROGPARALLEL_UTILS_H

#include <sys/time.h> // for struct timeval used below

#define max(a,b) \
   ({ __typeof__ (a) _a = (a); \
       __typeof__ (b) _b = (b); \
     _a > _b ? _a : _b; })

// fill the content of mat with value val
void val_mat(int m, int n, float* mat, float val);
// fill the content of mat with random values from 0.0 to max
void rand_mat(int m, int n, float* mat, float max);
// b = alpha*a + b
void myblas_sgepxy(float alpha, float* a, float* b, int m, int n);

// return the time between start and stop in µs
long unsigned int time_interval(struct timeval start, struct timeval stop);
// deprecated
void print_gflops(struct timeval stop, struct timeval start, int m, int n, int k, int b, int kernel, char* name, float * c);
// print the content of a[0:m-1,0:n-1] with given name
void print_mat(float* a, int m, int n, char* name);

// fill coordinates according to node = q * coordinates[0] + coordinates[1]
void node_coordinates(int p, int q, int node, int* coordinates);
// fill my_row/col according to node = q * my_row + my_col
void node_coordinates_2i(int p, int q, int node, int* my_row, int* my_col);
// return the owner node of a block A_i,j on a p x q grid
int get_node(int p, int q, int i, int j);
// return i*n + j
int item_2d(int i, int j, int m, int n);

// get the name of the machine
void get_host_name(char* hostname, int buffer_size);
#endif
23
TP2/src/who_am_i.c
Normal file
@ -0,0 +1,23 @@
#include <stdio.h>
#include <mpi.h>

int main( int argc, char *argv[] ) {

  int rank, size;
  int l;
  char name[MPI_MAX_PROCESSOR_NAME];

  //MPI_Init (&argc, &argv);  /* starts MPI */
  MPI_Init (NULL, NULL);      /* starts MPI */

  MPI_Comm_rank (MPI_COMM_WORLD, &rank);  /* get current process id */
  MPI_Comm_size (MPI_COMM_WORLD, &size);  /* get number of processes */

  MPI_Get_processor_name(name, &l);       /* get processor name */

  printf("Hello world from process %d of %d on processor named %s\n", rank, size, name);

  MPI_Finalize();

  return 0;
}
BIN
TP2/subject_mpi.pdf
Normal file
Binary file not shown.
53
TP2/utils.sh
Executable file
@ -0,0 +1,53 @@
TOOLS_DIR=/mnt/n7fs/ens/tp_guivarch/opt2021
SIMGRID_DIR=$TOOLS_DIR/simgrid-3.31
VITE_DIR=$TOOLS_DIR/vite

export PATH=${SIMGRID_DIR}/bin:${PATH}

# for check and bench

tmp=$HOME/tmp_simgrid
mkdir -p $tmp
my_mpirun="$SIMGRID_DIR/bin/smpirun -trace --cfg=smpi/tmpdir:$tmp"
traces="traces"
exec=build/bin/main

generate_hostfile() {
  N=${1:-4}
  mkdir -p hostfiles
  rm -f hostfiles/hostfile.$N.txt
  for i in $(seq 1 $N); do
    echo node-${i}.simgrid.org >>hostfiles/hostfile.$N.txt
  done
}

run() {
  human=${1:-0}
  mkdir -p $out
  echo $my_mpirun $mpi_options ${exec:-build/bin/main} -m $m -k $k -n $n -b $b -a $algo -p $p -q $q -i $iter $options
  $my_mpirun $mpi_options ${exec:-build/bin/main} -m $m -k $k -n $n -b $b -a $algo -p $p -q $q -i $iter $options &>$out/$algo.out
  echo reading $out/$algo.out
  correct=$(grep -i "gemm is correct" "$out/$algo.out" | wc -l)
  trial=$(grep "Gflop/s" $out/$algo.out | grep $algo | wc -l)
  echo Found $correct correct GEMM out of $trial

  while read line; do
    # [0] (p2p) measured_wtime = 0.000058s (la=0) | 0.002195 Gflop/s
    gflops=$(echo $line | grep -o "| .* Gflop/s" | grep -o "[0-9]\\+.[0-9]\\+")
    if [ $human -eq 0 ]; then
      echo "$m,$k,$n,$b,$p,$q,$algo,$la,$gflops"
    else
      echo "mxnxk=${m}x${n}x${k},b=$b,p x q = $p x $q | using $algo, (lookahead:$la) => $gflops Gflop/s"
    fi
    echo "$m,$k,$n,$b,$p,$q,$algo,$la,$gflops" >>$csv
  done < <(grep "Gflop/s" $out/$algo.out | grep $algo)

  if [ $la -gt 0 ]; then
    algo=$algo-$la
  fi

  mkdir -p $traces
  mv -f smpi_simgrid.trace $traces/$algo.trace
  echo You can open $traces/$algo.trace with $VITE_DIR/build/bin/vite
  echo
}