diff --git a/myloss.py b/dice_loss.py
similarity index 81%
rename from myloss.py
rename to dice_loss.py
index e28a10d..29a287d 100644
--- a/myloss.py
+++ b/dice_loss.py
@@ -1,17 +1,12 @@
-#
-# myloss.py : implementation of the Dice coeff and the associated loss
-#
-
 import torch
 from torch.autograd import Function, Variable
-
 
 class DiceCoeff(Function):
     """Dice coeff for individual examples"""
 
     def forward(self, input, target):
         self.save_for_backward(input, target)
-        self.inter = torch.dot(input, target) + 0.0001
+        self.inter = torch.dot(input.view(-1), target.view(-1)) + 0.0001
         self.union = torch.sum(input) + torch.sum(target) + 0.0001
 
         t = 2 * self.inter.float() / self.union.float()
@@ -35,9 +30,9 @@ class DiceCoeff(Function):
 def dice_coeff(input, target):
     """Dice coeff for batches"""
     if input.is_cuda:
-        s = Variable(torch.FloatTensor(1).cuda().zero_())
+        s = torch.FloatTensor(1).cuda().zero_()
     else:
-        s = Variable(torch.FloatTensor(1).zero_())
+        s = torch.FloatTensor(1).zero_()
 
     for i, c in enumerate(zip(input, target)):
         s = s + DiceCoeff().forward(c[0], c[1])
diff --git a/eval.py b/eval.py
index de56801..944c111 100644
--- a/eval.py
+++ b/eval.py
@@ -1,55 +1,25 @@
-import matplotlib.pyplot as plt
-import numpy as np
 import torch
 import torch.nn.functional as F
-from torch.autograd import Variable
 
-from myloss import dice_coeff
-from utils import dense_crf
+from dice_loss import dice_coeff
 
 
 def eval_net(net, dataset, gpu=False):
+    """Evaluation without the dense CRF, using the Dice coefficient"""
     tot = 0
     for i, b in enumerate(dataset):
-        X = b[0]
-        y = b[1]
+        img = b[0]
+        true_mask = b[1]
 
-        X = torch.FloatTensor(X).unsqueeze(0)
-        y = torch.ByteTensor(y).unsqueeze(0)
+        img = torch.from_numpy(img).unsqueeze(0)
+        true_mask = torch.from_numpy(true_mask).unsqueeze(0)
 
         if gpu:
-            X = Variable(X, volatile=True).cuda()
-            y = Variable(y, volatile=True).cuda()
-        else:
-            X = Variable(X, volatile=True)
-            y = Variable(y, volatile=True)
+            img = img.cuda()
+            true_mask = true_mask.cuda()
 
-        y_pred = net(X)
+        mask_pred = net(img)[0]
+        mask_pred = (F.sigmoid(mask_pred) > 0.5).float()
 
-        y_pred = (F.sigmoid(y_pred) > 0.6).float()
-        # y_pred = F.sigmoid(y_pred).float()
-
-        dice = dice_coeff(y_pred, y.float()).data[0]
-        tot += dice
-
-        if 0:
-            X = X.data.squeeze(0).cpu().numpy()
-            X = np.transpose(X, axes=[1, 2, 0])
-            y = y.data.squeeze(0).cpu().numpy()
-            y_pred = y_pred.data.squeeze(0).squeeze(0).cpu().numpy()
-            print(y_pred.shape)
-
-            fig = plt.figure()
-            ax1 = fig.add_subplot(1, 4, 1)
-            ax1.imshow(X)
-            ax2 = fig.add_subplot(1, 4, 2)
-            ax2.imshow(y)
-            ax3 = fig.add_subplot(1, 4, 3)
-            ax3.imshow((y_pred > 0.5))
-
-            Q = dense_crf(((X * 255).round()).astype(np.uint8), y_pred)
-            ax4 = fig.add_subplot(1, 4, 4)
-            print(Q)
-            ax4.imshow(Q > 0.5)
-            plt.show()
+        tot += dice_coeff(mask_pred, true_mask).item()
     return tot / i
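
For reference, the Dice coefficient computed above is 2|A ∩ B| / (|A| + |B|). Below is a minimal, self-contained sketch on current PyTorch; the dice_score helper is illustrative only (it is not part of this change) and mirrors the 0.0001 smoothing term used in DiceCoeff to avoid division by zero on empty masks:

    import torch

    def dice_score(pred, target, eps=1e-4):
        # Flatten both tensors to 1-D, since torch.dot only accepts vectors.
        pred = pred.contiguous().view(-1)
        target = target.contiguous().view(-1)
        inter = torch.dot(pred, target)           # soft |A ∩ B|
        union = pred.sum() + target.sum()         # |A| + |B|
        return (2 * inter + eps) / (union + eps)  # eps keeps 0/0 well-defined

    # dice_score(torch.ones(4, 4), torch.ones(4, 4)) -> tensor(1.0000)
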
diff --git a/predict.py b/predict.py
index 45d442e..d963b04 100644
--- a/predict.py
+++ b/predict.py
@@ -1,48 +1,64 @@
 import argparse
+import os
 
-import numpy
+import numpy as np
 import torch
 import torch.nn.functional as F
-from torch.autograd import Variable
+
+from PIL import Image
 
 from unet import UNet
-from utils import *
+from utils import resize_and_crop, normalize, split_img_into_squares, hwc_to_chw, merge_masks, dense_crf
+from utils import plot_img_and_mask
+
+def predict_img(net,
+                full_img,
+                scale_factor=0.5,
+                out_threshold=0.5,
+                use_dense_crf=True,
+                use_gpu=False):
+
+    img_height = full_img.size[1]
+    img_width = full_img.size[0]
+
+    img = resize_and_crop(full_img, scale=scale_factor)
+    img = normalize(img)
+
+    left_square, right_square = split_img_into_squares(img)
+
+    left_square = hwc_to_chw(left_square)
+    right_square = hwc_to_chw(right_square)
+
+    X_left = torch.from_numpy(left_square).unsqueeze(0)
+    X_right = torch.from_numpy(right_square).unsqueeze(0)
+
+    if use_gpu:
+        X_left = X_left.cuda()
+        X_right = X_right.cuda()
+
+    with torch.no_grad():
+        output_left = net(X_left)
+        output_right = net(X_right)
+
+        left_probs = F.sigmoid(output_left)
+        right_probs = F.sigmoid(output_right)
+
+        left_probs = F.upsample(left_probs, size=(img_height, img_height))
+        right_probs = F.upsample(right_probs, size=(img_height, img_height))
+
+        left_mask_np = left_probs.squeeze().cpu().numpy()
+        right_mask_np = right_probs.squeeze().cpu().numpy()
+
+    full_mask = merge_masks(left_mask_np, right_mask_np, img_width)
+
+    if use_dense_crf:
+        full_mask = dense_crf(np.array(full_img).astype(np.uint8), full_mask)
+
+    return full_mask > out_threshold
 
-def predict_img(net, full_img, gpu=False):
-    img = resize_and_crop(full_img)
-    left = get_square(img, 0)
-    right = get_square(img, 1)
-
-    right = normalize(right)
-    left = normalize(left)
-
-    right = np.transpose(right, axes=[2, 0, 1])
-    left = np.transpose(left, axes=[2, 0, 1])
-
-    X_l = torch.FloatTensor(left).unsqueeze(0)
-    X_r = torch.FloatTensor(right).unsqueeze(0)
-
-    if gpu:
-        X_l = Variable(X_l, volatile=True).cuda()
-        X_r = Variable(X_r, volatile=True).cuda()
-    else:
-        X_l = Variable(X_l, volatile=True)
-        X_r = Variable(X_r, volatile=True)
-
-    y_l = F.sigmoid(net(X_l))
-    y_r = F.sigmoid(net(X_r))
-    y_l = F.upsample_bilinear(y_l, scale_factor=2).data[0][0].cpu().numpy()
-    y_r = F.upsample_bilinear(y_r, scale_factor=2).data[0][0].cpu().numpy()
-
-    y = merge_masks(y_l, y_r, full_img.size[0])
-    yy = dense_crf(np.array(full_img).astype(np.uint8), y)
-
-    return yy > 0.5
-
-
-if __name__ == "__main__":
+def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument('--model', '-m', default='MODEL.pth',
                         metavar='FILE',
@@ -61,19 +77,22 @@ if __name__ == "__main__":
     parser.add_argument('--no-save', '-n', action='store_false',
                         help="Do not save the output masks",
                         default=False)
+    parser.add_argument('--no-crf', '-r', action='store_true',
+                        help="Do not use dense CRF postprocessing",
+                        default=False)
+    parser.add_argument('--mask-threshold', '-t', type=float,
+                        help="Minimum probability value to consider a mask pixel white",
+                        default=0.5)
+    parser.add_argument('--scale', '-s', type=float,
+                        help="Scale factor for the input images",
+                        default=0.5)
 
-    args = parser.parse_args()
-    print("Using model file : {}".format(args.model))
-    net = UNet(3, 1)
-    if not args.cpu:
-        print("Using CUDA version of the net, prepare your GPU!")
-        net.cuda()
-    else:
-        net.cpu()
-        print("Using CPU version of the net, this may be very slow")
+    return parser.parse_args()
 
+def get_output_filenames(args):
     in_files = args.input
     out_files = []
+
     if not args.output:
         for f in in_files:
             pathsplit = os.path.splitext(f)
@@ -84,32 +103,52 @@ if __name__ == "__main__":
     else:
         out_files = args.output
 
-    print("Loading model ...")
-    net.load_state_dict(torch.load(args.model))
+    return out_files
+
+def mask_to_image(mask):
+    return Image.fromarray((mask * 255).astype(np.uint8))
+
+if __name__ == "__main__":
+    args = get_args()
+    in_files = args.input
+    out_files = get_output_filenames(args)
+
+    net = UNet(n_channels=3, n_classes=1)
+
+    print("Loading model {}".format(args.model))
+
+    if not args.cpu:
+        print("Using CUDA version of the net, prepare your GPU!")
+        net.cuda()
+        net.load_state_dict(torch.load(args.model))
+    else:
+        net.cpu()
+        net.load_state_dict(torch.load(args.model, map_location='cpu'))
+        print("Using CPU version of the net, this may be very slow")
+
+    print("Model loaded!")
 
     for i, fn in enumerate(in_files):
         print("\nPredicting image {} ...".format(fn))
+
         img = Image.open(fn)
 
-        out = predict_img(net, img, not args.cpu)
+        if img.size[0] < img.size[1]:
+            print("Error: image height is larger than the width")
+
+        mask = predict_img(net=net,
+                           full_img=img,
+                           scale_factor=args.scale,
+                           out_threshold=args.mask_threshold,
+                           use_dense_crf=not args.no_crf,
+                           use_gpu=not args.cpu)
 
         if args.viz:
-            print("Vizualising results for image {}, close to continue ..."
-                  .format(fn))
-
-            fig = plt.figure()
-            a = fig.add_subplot(1, 2, 1)
-            a.set_title('Input image')
-            plt.imshow(img)
-
-            b = fig.add_subplot(1, 2, 2)
-            b.set_title('Output mask')
-            plt.imshow(out)
-
-            plt.show()
+            print("Visualizing results for image {}, close to continue ...".format(fn))
+            plot_img_and_mask(img, mask)
 
         if not args.no_save:
             out_fn = out_files[i]
-            result = Image.fromarray((out * 255).astype(numpy.uint8))
+            result = mask_to_image(mask)
             result.save(out_files[i])
+
+            print("Mask saved to {}".format(out_files[i]))
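
predict_img deals with the wide Carvana images by cutting them into two overlapping squares, predicting each square separately, and stitching the halves back together with merge_masks. A toy numpy sketch of that split/merge geometry (hypothetical 4x6 array; merge_masks assumes an even width at least as large as the height, which the Carvana images satisfy):

    import numpy as np

    # Toy "image" with H=4, W=6; the two squares overlap in the middle columns.
    img = np.arange(24, dtype=np.float32).reshape(4, 6)

    h, w = img.shape
    left, right = img[:, :h], img[:, -h:]  # two h-by-h squares, as in get_square

    # Recombine as merge_masks does: left half plus one column from the left
    # square, the remaining columns from the right square.
    merged = np.zeros((h, w), np.float32)
    merged[:, :w // 2 + 1] = left[:, :w // 2 + 1]
    merged[:, w // 2 + 1:] = right[:, -(w // 2 - 1):]

    assert np.array_equal(merged, img)  # lossless when the "prediction" is identity
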
diff --git a/submit.py b/submit.py
index 93d197e..12f26cf 100644
--- a/submit.py
+++ b/submit.py
@@ -1,10 +1,15 @@
-# used to predict all test images and encode results in a csv file
+import os
+from PIL import Image
 
-from predict import *
+import torch
+
+from predict import predict_img
+from utils import rle_encode
 from unet import UNet
 
 
 def submit(net, gpu=False):
+    """Used for Kaggle submission: predicts and encodes all test images"""
     dir = 'data/test/'
 
     N = len(list(os.listdir(dir)))
diff --git a/train.py b/train.py
index c332657..3b0e4d6 100644
--- a/train.py
+++ b/train.py
@@ -1,20 +1,27 @@
 import sys
+import os
 from optparse import OptionParser
+import numpy as np
 
 import torch
 import torch.backends.cudnn as cudnn
 import torch.nn as nn
 import torch.nn.functional as F
 from torch import optim
-from torch.autograd import Variable
 
 from eval import eval_net
 from unet import UNet
-from utils import *
+from utils import get_ids, split_ids, split_train_val, get_imgs_and_masks, batch
 
+def train_net(net,
+              epochs=5,
+              batch_size=1,
+              lr=0.1,
+              val_percent=0.05,
+              save_cp=True,
+              gpu=False,
+              img_scale=0.5):
 
-def train_net(net, epochs=5, batch_size=2, lr=0.1, val_percent=0.05,
-              cp=True, gpu=False):
     dir_img = 'data/train/'
     dir_mask = 'data/train_masks/'
     dir_checkpoint = 'checkpoints/'
@@ -34,69 +41,66 @@ def train_net(net, epochs=5, batch_size=2, lr=0.1, val_percent=0.05,
        Checkpoints: {}
        CUDA: {}
     '''.format(epochs, batch_size, lr, len(iddataset['train']),
-               len(iddataset['val']), str(cp), str(gpu)))
+               len(iddataset['val']), str(save_cp), str(gpu)))
 
     N_train = len(iddataset['train'])
 
     optimizer = optim.SGD(net.parameters(),
-                          lr=lr, momentum=0.9, weight_decay=0.0005)
+                          lr=lr,
+                          momentum=0.9,
+                          weight_decay=0.0005)
+
     criterion = nn.BCELoss()
 
     for epoch in range(epochs):
         print('Starting epoch {}/{}.'.format(epoch + 1, epochs))
 
         # reset the generators
-        train = get_imgs_and_masks(iddataset['train'], dir_img, dir_mask)
-        val = get_imgs_and_masks(iddataset['val'], dir_img, dir_mask)
+        train = get_imgs_and_masks(iddataset['train'], dir_img, dir_mask, img_scale)
+        val = get_imgs_and_masks(iddataset['val'], dir_img, dir_mask, img_scale)
 
         epoch_loss = 0
+
+        for i, b in enumerate(batch(train, batch_size)):
+            imgs = np.array([i[0] for i in b]).astype(np.float32)
+            true_masks = np.array([i[1] for i in b])
+
+            imgs = torch.from_numpy(imgs)
+            true_masks = torch.from_numpy(true_masks)
+
+            if gpu:
+                imgs = imgs.cuda()
+                true_masks = true_masks.cuda()
+
+            masks_pred = net(imgs)
+            masks_probs = F.sigmoid(masks_pred)
+            masks_probs_flat = masks_probs.view(-1)
+
+            true_masks_flat = true_masks.view(-1)
+
+            loss = criterion(masks_probs_flat, true_masks_flat)
+            epoch_loss += loss.item()
+
+            print('{0:.4f} --- loss: {1:.6f}'.format(i * batch_size / N_train, loss.item()))
+
+            optimizer.zero_grad()
+            loss.backward()
+            optimizer.step()
+
+        print('Epoch finished! Loss: {}'.format(epoch_loss / (i + 1)))
+
         if 1:
             val_dice = eval_net(net, val, gpu)
             print('Validation Dice Coeff: {}'.format(val_dice))
 
-        for i, b in enumerate(batch(train, batch_size)):
-            X = np.array([i[0] for i in b])
-            y = np.array([i[1] for i in b])
-
-            X = torch.FloatTensor(X)
-            y = torch.ByteTensor(y)
-
-            if gpu:
-                X = Variable(X).cuda()
-                y = Variable(y).cuda()
-            else:
-                X = Variable(X)
-                y = Variable(y)
-
-            y_pred = net(X)
-            probs = F.sigmoid(y_pred)
-            probs_flat = probs.view(-1)
-
-            y_flat = y.view(-1)
-
-            loss = criterion(probs_flat, y_flat.float())
-            epoch_loss += loss.data[0]
-
-            print('{0:.4f} --- loss: {1:.6f}'.format(i * batch_size / N_train,
-                                                     loss.data[0]))
-
-            optimizer.zero_grad()
-
-            loss.backward()
-
-            optimizer.step()
-
-            print('Epoch finished ! Loss: {}'.format(epoch_loss / i))
-
-        if cp:
+        if save_cp:
             torch.save(net.state_dict(),
                        dir_checkpoint + 'CP{}.pth'.format(epoch + 1))
-            print('Checkpoint {} saved !'.format(epoch + 1))
 
-if __name__ == '__main__':
+
+def get_args():
     parser = OptionParser()
     parser.add_option('-e', '--epochs', dest='epochs', default=5, type='int',
                       help='number of epochs')
@@ -108,22 +112,32 @@ if __name__ == '__main__':
                       default=False, help='use cuda')
     parser.add_option('-c', '--load', dest='load',
                       default=False, help='load file model')
+    parser.add_option('-s', '--scale', dest='scale', type='float',
+                      default=0.5, help='downscaling factor of the images')
 
     (options, args) = parser.parse_args()
+    return options
 
-    net = UNet(3, 1)
+if __name__ == '__main__':
+    args = get_args()
 
-    if options.load:
-        net.load_state_dict(torch.load(options.load))
-        print('Model loaded from {}'.format(options.load))
+    net = UNet(n_channels=3, n_classes=1)
 
-    if options.gpu:
+    if args.load:
+        net.load_state_dict(torch.load(args.load))
+        print('Model loaded from {}'.format(args.load))
+
+    if args.gpu:
         net.cuda()
-        cudnn.benchmark = True
+        # cudnn.benchmark = True # faster convolutions, but more memory
 
     try:
-        train_net(net, options.epochs, options.batchsize, options.lr,
-                  gpu=options.gpu)
+        train_net(net=net,
+                  epochs=args.epochs,
+                  batch_size=args.batchsize,
+                  lr=args.lr,
+                  gpu=args.gpu,
+                  img_scale=args.scale)
     except KeyboardInterrupt:
         torch.save(net.state_dict(), 'INTERRUPTED.pth')
         print('Saved interrupt')
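
The training loop above applies F.sigmoid to the logits and then nn.BCELoss. As an aside (not what this diff does), the two steps can be folded into nn.BCEWithLogitsLoss, which is mathematically equivalent but numerically stable for large-magnitude logits:

    import torch
    import torch.nn as nn

    logits = torch.randn(2, 1, 8, 8)  # raw network outputs, no sigmoid applied
    targets = torch.randint(0, 2, (2, 1, 8, 8)).float()

    a = nn.BCELoss()(torch.sigmoid(logits), targets)
    b = nn.BCEWithLogitsLoss()(logits, targets)
    assert torch.allclose(a, b, atol=1e-6)  # equal up to floating-point error
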
diff --git a/unet/unet_model.py b/unet/unet_model.py
index 4afb8dd..a09ee5b 100644
--- a/unet/unet_model.py
+++ b/unet/unet_model.py
@@ -1,14 +1,7 @@
-#!/usr/bin/python
 # full assembly of the sub-parts to form the complete net
 
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-# python 3 confusing imports :(
 from .unet_parts import *
 
-
 class UNet(nn.Module):
     def __init__(self, n_channels, n_classes):
         super(UNet, self).__init__()
diff --git a/unet/unet_parts.py b/unet/unet_parts.py
index c7128d0..7fcadc7 100644
--- a/unet/unet_parts.py
+++ b/unet/unet_parts.py
@@ -1,5 +1,3 @@
-#!/usr/bin/python
-
 # sub-parts of the U-Net model
 
 import torch
@@ -53,9 +51,9 @@ class up(nn.Module):
         super(up, self).__init__()
 
         #  would be a nice idea if the upsampling could be learned too,
-        #  but my machine do not have enough memory to handle all those weights
+        # but my machine does not have enough memory to handle all those weights
         if bilinear:
-            self.up = nn.UpsamplingBilinear2d(scale_factor=2)
+            self.up = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)
         else:
             self.up = nn.ConvTranspose2d(in_ch//2, in_ch//2, 2, stride=2)
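
Both branches of `up` double the spatial resolution, but only the transposed convolution carries learnable weights, which is what the memory comment above refers to. A quick shape check with illustrative sizes:

    import torch
    import torch.nn as nn

    x = torch.randn(1, 64, 16, 16)  # batch of one 64-channel feature map

    bilinear = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)
    learned = nn.ConvTranspose2d(64 // 2, 64 // 2, 2, stride=2)  # in_ch // 2, as above

    print(bilinear(x).shape)         # torch.Size([1, 64, 32, 32]), zero parameters
    print(learned(x[:, :32]).shape)  # torch.Size([1, 32, 32, 32]), trained weights
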
diff --git a/utils/crf.py b/utils/crf.py
index 5ee718f..8a79953 100644
--- a/utils/crf.py
+++ b/utils/crf.py
@@ -1,7 +1,6 @@
 import numpy as np
 import pydensecrf.densecrf as dcrf
 
-
 def dense_crf(img, output_probs):
     h = output_probs.shape[0]
     w = output_probs.shape[1]
diff --git a/utils/data_vis.py b/utils/data_vis.py
index 365e4d1..4ec2f60 100644
--- a/utils/data_vis.py
+++ b/utils/data_vis.py
@@ -1,13 +1,12 @@
 import matplotlib.pyplot as plt
 
-
-def plot_img_mask(img, mask):
+def plot_img_and_mask(img, mask):
     fig = plt.figure()
+    a = fig.add_subplot(1, 2, 1)
+    a.set_title('Input image')
+    plt.imshow(img)
 
-    ax1 = fig.add_subplot(1, 3, 1)
-    ax1.imshow(img)
-
-    ax2 = fig.add_subplot(1, 3, 2)
-    ax2.imshow(mask)
-
-    plt.show()
+    b = fig.add_subplot(1, 2, 2)
+    b.set_title('Output mask')
+    plt.imshow(mask)
+    plt.show()
\ No newline at end of file
diff --git a/utils/load.py b/utils/load.py
index 5ab7f80..8317ffc 100644
--- a/utils/load.py
+++ b/utils/load.py
@@ -3,12 +3,11 @@
 #  cropped images and masks
 
 import os
-from functools import partial
 
 import numpy as np
 from PIL import Image
 
-from .utils import resize_and_crop, get_square, normalize
+from .utils import resize_and_crop, get_square, normalize, hwc_to_chw
 
 
 def get_ids(dir):
@@ -21,23 +20,22 @@ def split_ids(ids, n=2):
     return ((id, i) for i in range(n) for id in ids)
 
 
-def to_cropped_imgs(ids, dir, suffix):
+def to_cropped_imgs(ids, dir, suffix, scale):
     """From a list of tuples, returns the correct cropped img"""
     for id, pos in ids:
-        im = resize_and_crop(Image.open(dir + id + suffix))
+        im = resize_and_crop(Image.open(dir + id + suffix), scale=scale)
         yield get_square(im, pos)
 
-
-def get_imgs_and_masks(ids, dir_img, dir_mask):
+def get_imgs_and_masks(ids, dir_img, dir_mask, scale):
     """Return all the couples (img, mask)"""
 
-    imgs = to_cropped_imgs(ids, dir_img, '.jpg')
+    imgs = to_cropped_imgs(ids, dir_img, '.jpg', scale)
 
     # need to transform from HWC to CHW
-    imgs_switched = map(partial(np.transpose, axes=[2, 0, 1]), imgs)
+    imgs_switched = map(hwc_to_chw, imgs)
     imgs_normalized = map(normalize, imgs_switched)
 
-    masks = to_cropped_imgs(ids, dir_mask, '_mask.gif')
+    masks = to_cropped_imgs(ids, dir_mask, '_mask.gif', scale)
 
     return zip(imgs_normalized, masks)
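
Note that get_imgs_and_masks builds a chain of generators and map objects, so no image is opened or normalized until the training loop actually iterates. A toy sketch of the same lazy pattern, with fake_load standing in for the real file I/O:

    import numpy as np

    ids = ['car_001', 'car_002']  # hypothetical image ids

    # split_ids-style: each id yields one tuple per square position
    pairs = ((img_id, pos) for pos in range(2) for img_id in ids)

    # to_cropped_imgs-style generator; nothing is "loaded" yet
    fake_load = lambda id_pos: np.zeros((3, 64, 64), np.float32)
    imgs = map(fake_load, pairs)

    # normalize is mapped lazily as well
    imgs = map(lambda a: a / 255, imgs)

    first = next(imgs)  # only now is the first array produced
    print(first.shape)  # (3, 64, 64)
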
diff --git a/utils/utils.py b/utils/utils.py
index 9b26506..830f1ce 100644
--- a/utils/utils.py
+++ b/utils/utils.py
@@ -1,17 +1,20 @@
 import random
-
 import numpy as np
 
 
 def get_square(img, pos):
-    """Extract a left or a right square from PILimg shape : (H, W, C))"""
-    img = np.array(img)
+    """Extract a left or a right square from ndarray of shape (H, W, C)"""
     h = img.shape[0]
     if pos == 0:
         return img[:, :h]
     else:
         return img[:, -h:]
 
+def split_img_into_squares(img):
+    return get_square(img, 0), get_square(img, 1)
+
+def hwc_to_chw(img):
+    return np.transpose(img, axes=[2, 0, 1])
 
 def resize_and_crop(pilimg, scale=0.5, final_height=None):
     w = pilimg.size[0]
@@ -26,8 +29,7 @@ def resize_and_crop(pilimg, scale=0.5, final_height=None):
     img = pilimg.resize((newW, newH))
     img = img.crop((0, diff // 2, newW, newH - diff // 2))
 
-    return img
-
+    return np.array(img, dtype=np.float32)
 
 def batch(iterable, batch_size):
     """Yields lists by batch"""
@@ -41,7 +43,6 @@ def batch(iterable, batch_size):
     if len(b) > 0:
         yield b
 
-
 def split_train_val(dataset, val_percent=0.05):
     dataset = list(dataset)
     length = len(dataset)
@@ -53,18 +54,17 @@ def split_train_val(dataset, val_percent=0.05):
 def normalize(x):
     return x / 255
 
-
 def merge_masks(img1, img2, full_w):
     h = img1.shape[0]
 
     new = np.zeros((h, full_w), np.float32)
-
     new[:, :full_w // 2 + 1] = img1[:, :full_w // 2 + 1]
     new[:, full_w // 2 + 1:] = img2[:, -(full_w // 2 - 1):]
 
     return new
 
 
+# credits to https://stackoverflow.com/users/6076729/manuel-lagunas
 def rle_encode(mask_image):
     pixels = mask_image.flatten()
     # We avoid issues with '1' at the start or end (at the corners of