import argparse
import os
import random
from collections import defaultdict

import model as models
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.optim.lr_scheduler import CosineAnnealingLR, StepLR
from torch.utils.data import DataLoader
from tqdm import tqdm
from util.data_util import PartNormalDataset
from util.util import IOStream, compute_overall_iou, to_categorical

classes_str = [
    "aero",
    "bag",
    "cap",
    "car",
    "chair",
    "ear",
    "guitar",
    "knife",
    "lamp",
    "lapt",
    "moto",
    "mug",
    "Pistol",
    "rock",
    "stake",
    "table",
]


def _init_():
    if not os.path.exists("checkpoints"):
        os.makedirs("checkpoints")
    if not os.path.exists("checkpoints/" + args.exp_name):
        os.makedirs("checkpoints/" + args.exp_name)


def weight_init(m):
    if isinstance(m, torch.nn.Linear):
        torch.nn.init.xavier_normal_(m.weight)
        if m.bias is not None:
            torch.nn.init.constant_(m.bias, 0)
    elif isinstance(m, torch.nn.Conv2d):
        torch.nn.init.xavier_normal_(m.weight)
        if m.bias is not None:
            torch.nn.init.constant_(m.bias, 0)
    elif isinstance(m, torch.nn.Conv1d):
        torch.nn.init.xavier_normal_(m.weight)
        if m.bias is not None:
            torch.nn.init.constant_(m.bias, 0)
    elif isinstance(m, torch.nn.BatchNorm2d):
        torch.nn.init.constant_(m.weight, 1)
        torch.nn.init.constant_(m.bias, 0)
    elif isinstance(m, torch.nn.BatchNorm1d):
        torch.nn.init.constant_(m.weight, 1)
        torch.nn.init.constant_(m.bias, 0)


def train(args, io):
    # ============= Model ===================
    num_part = 50
    device = torch.device("cuda" if args.cuda else "cpu")

    model = models.__dict__[args.model](num_part).to(device)
    io.cprint(str(model))

    model.apply(weight_init)
    model = nn.DataParallel(model)
    print("Let's use", torch.cuda.device_count(), "GPUs!")

    """Resume or not"""
    if args.resume:
        state_dict = torch.load(
            "checkpoints/%s/best_insiou_model.pth" % args.exp_name,
            map_location=torch.device("cpu"),
        )["model"]
        for k in state_dict.keys():
            if "module" not in k:
                from collections import OrderedDict

                new_state_dict = OrderedDict()
                for k in state_dict:
                    new_state_dict["module." + k] = state_dict[k]
                state_dict = new_state_dict
            break
        model.load_state_dict(state_dict)

        print("Resume training model...")
        print(torch.load("checkpoints/%s/best_insiou_model.pth" % args.exp_name).keys())
    else:
        print("Training from scratch...")

    # =========== Dataloader =================
    train_data = PartNormalDataset(npoints=2048, split="trainval", normalize=False)
    print("The number of training data is:%d", len(train_data))

    test_data = PartNormalDataset(npoints=2048, split="test", normalize=False)
    print("The number of test data is:%d", len(test_data))

    train_loader = DataLoader(
        train_data,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.workers,
        drop_last=True,
    )

    test_loader = DataLoader(
        test_data,
        batch_size=args.test_batch_size,
        shuffle=False,
        num_workers=args.workers,
        drop_last=False,
    )

    # ============= Optimizer ================
    if args.use_sgd:
        print("Use SGD")
        opt = optim.SGD(model.parameters(), lr=args.lr * 100, momentum=args.momentum, weight_decay=0)
    else:
        print("Use Adam")
        opt = optim.Adam(model.parameters(), lr=args.lr, betas=(0.9, 0.999), eps=1e-08, weight_decay=0)

    if args.scheduler == "cos":
        print("Use CosLR")
        scheduler = CosineAnnealingLR(opt, args.epochs, eta_min=args.lr if args.use_sgd else args.lr / 100)
    else:
        print("Use StepLR")
        scheduler = StepLR(opt, step_size=args.step, gamma=0.5)

    # ============= Training =================
    best_acc = 0
    best_class_iou = 0
    best_instance_iou = 0
    num_part = 50
    num_classes = 16

    for epoch in range(args.epochs):
        train_epoch(train_loader, model, opt, scheduler, epoch, num_part, num_classes, io)

        test_metrics, total_per_cat_iou = test_epoch(test_loader, model, epoch, num_part, num_classes, io)

        # 1. when get the best accuracy, save the model:
        if test_metrics["accuracy"] > best_acc:
            best_acc = test_metrics["accuracy"]
            io.cprint("Max Acc:%.5f" % best_acc)
            state = {
                "model": model.module.state_dict() if torch.cuda.device_count() > 1 else model.state_dict(),
                "optimizer": opt.state_dict(),
                "epoch": epoch,
                "test_acc": best_acc,
            }
            torch.save(state, "checkpoints/%s/best_acc_model.pth" % args.exp_name)

        # 2. when get the best instance_iou, save the model:
        if test_metrics["shape_avg_iou"] > best_instance_iou:
            best_instance_iou = test_metrics["shape_avg_iou"]
            io.cprint("Max instance iou:%.5f" % best_instance_iou)
            state = {
                "model": model.module.state_dict() if torch.cuda.device_count() > 1 else model.state_dict(),
                "optimizer": opt.state_dict(),
                "epoch": epoch,
                "test_instance_iou": best_instance_iou,
            }
            torch.save(state, "checkpoints/%s/best_insiou_model.pth" % args.exp_name)

        # 3. when get the best class_iou, save the model:
        # first we need to calculate the average per-class iou
        class_iou = 0
        for cat_idx in range(16):
            class_iou += total_per_cat_iou[cat_idx]
        avg_class_iou = class_iou / 16
        if avg_class_iou > best_class_iou:
            best_class_iou = avg_class_iou
            # print the iou of each class:
            for cat_idx in range(16):
                io.cprint(classes_str[cat_idx] + " iou: " + str(total_per_cat_iou[cat_idx]))
            io.cprint("Max class iou:%.5f" % best_class_iou)
            state = {
                "model": model.module.state_dict() if torch.cuda.device_count() > 1 else model.state_dict(),
                "optimizer": opt.state_dict(),
                "epoch": epoch,
                "test_class_iou": best_class_iou,
            }
            torch.save(state, "checkpoints/%s/best_clsiou_model.pth" % args.exp_name)

    # report best acc, ins_iou, cls_iou
    io.cprint("Final Max Acc:%.5f" % best_acc)
    io.cprint("Final Max instance iou:%.5f" % best_instance_iou)
    io.cprint("Final Max class iou:%.5f" % best_class_iou)
    # save last model
    state = {
        "model": model.module.state_dict() if torch.cuda.device_count() > 1 else model.state_dict(),
        "optimizer": opt.state_dict(),
        "epoch": args.epochs - 1,
        "test_iou": best_instance_iou,
    }
    torch.save(state, "checkpoints/%s/model_ep%d.pth" % (args.exp_name, args.epochs))


def train_epoch(train_loader, model, opt, scheduler, epoch, num_part, num_classes, io):
    train_loss = 0.0
    count = 0.0
    accuracy = []
    shape_ious = 0.0
    metrics = defaultdict(lambda: list())
    model.train()

    for _batch_id, (points, label, target, norm_plt) in tqdm(
        enumerate(train_loader),
        total=len(train_loader),
        smoothing=0.9,
    ):
        batch_size, num_point, _ = points.size()
        points, label, target, norm_plt = (
            Variable(points.float()),
            Variable(label.long()),
            Variable(target.long()),
            Variable(norm_plt.float()),
        )
        points = points.transpose(2, 1)
        norm_plt = norm_plt.transpose(2, 1)
        points, label, target, norm_plt = (
            points.cuda(non_blocking=True),
            label.squeeze(1).cuda(non_blocking=True),
            target.cuda(non_blocking=True),
            norm_plt.cuda(non_blocking=True),
        )
        # target: b,n
        seg_pred = model(points, norm_plt, to_categorical(label, num_classes))  # seg_pred: b,n,50
        loss = F.nll_loss(seg_pred.contiguous().view(-1, num_part), target.view(-1, 1)[:, 0])

        # instance iou without considering the class average at each batch_size:
        batch_shapeious = compute_overall_iou(
            seg_pred,
            target,
            num_part,
        )  # list of of current batch_iou:[iou1,iou2,...,iou#b_size]
        # total iou of current batch in each process:
        batch_shapeious = seg_pred.new_tensor(
            [np.sum(batch_shapeious)],
            dtype=torch.float64,
        )  # same device with seg_pred!!!

        # Loss backward
        loss = torch.mean(loss)
        opt.zero_grad()
        loss.backward()
        opt.step()

        # accuracy
        seg_pred = seg_pred.contiguous().view(-1, num_part)  # b*n,50
        target = target.view(-1, 1)[:, 0]  # b*n
        pred_choice = seg_pred.contiguous().data.max(1)[1]  # b*n
        correct = pred_choice.eq(target.contiguous().data).sum()  # torch.int64: total number of correct-predict pts

        # sum
        shape_ious += batch_shapeious.item()  # count the sum of ious in each iteration
        count += batch_size  # count the total number of samples in each iteration
        train_loss += loss.item() * batch_size
        accuracy.append(correct.item() / (batch_size * num_point))  # append the accuracy of each iteration

        # Note: We do not need to calculate per_class iou during training

    if args.scheduler == "cos":
        scheduler.step()
    elif args.scheduler == "step":
        if opt.param_groups[0]["lr"] > 0.9e-5:
            scheduler.step()
        if opt.param_groups[0]["lr"] < 0.9e-5:
            for param_group in opt.param_groups:
                param_group["lr"] = 0.9e-5
    io.cprint("Learning rate: %f" % opt.param_groups[0]["lr"])

    metrics["accuracy"] = np.mean(accuracy)
    metrics["shape_avg_iou"] = shape_ious * 1.0 / count

    outstr = "Train %d, loss: %f, train acc: %f, train ins_iou: %f" % (
        epoch + 1,
        train_loss * 1.0 / count,
        metrics["accuracy"],
        metrics["shape_avg_iou"],
    )
    io.cprint(outstr)


def test_epoch(test_loader, model, epoch, num_part, num_classes, io):
    test_loss = 0.0
    count = 0.0
    accuracy = []
    shape_ious = 0.0
    final_total_per_cat_iou = np.zeros(16).astype(np.float32)
    final_total_per_cat_seen = np.zeros(16).astype(np.int32)
    metrics = defaultdict(lambda: list())
    model.eval()

    # label_size: b, means each sample has one corresponding class
    for _batch_id, (points, label, target, norm_plt) in tqdm(
        enumerate(test_loader),
        total=len(test_loader),
        smoothing=0.9,
    ):
        batch_size, num_point, _ = points.size()
        points, label, target, norm_plt = (
            Variable(points.float()),
            Variable(label.long()),
            Variable(target.long()),
            Variable(norm_plt.float()),
        )
        points = points.transpose(2, 1)
        norm_plt = norm_plt.transpose(2, 1)
        points, label, target, norm_plt = (
            points.cuda(non_blocking=True),
            label.squeeze(1).cuda(non_blocking=True),
            target.cuda(non_blocking=True),
            norm_plt.cuda(non_blocking=True),
        )
        seg_pred = model(points, norm_plt, to_categorical(label, num_classes))  # b,n,50

        # instance iou without considering the class average at each batch_size:
        batch_shapeious = compute_overall_iou(seg_pred, target, num_part)  # [b]
        # per category iou at each batch_size:

        for shape_idx in range(seg_pred.size(0)):  # sample_idx
            cur_gt_label = label[shape_idx]  # label[sample_idx], denotes current sample belongs to which cat
            final_total_per_cat_iou[cur_gt_label] += batch_shapeious[shape_idx]  # add the iou belongs to this cat
            final_total_per_cat_seen[cur_gt_label] += 1  # count the number of this cat is chosen

        # total iou of current batch in each process:
        batch_ious = seg_pred.new_tensor([np.sum(batch_shapeious)], dtype=torch.float64)  # same device with seg_pred!!!

        # prepare seg_pred and target for later calculating loss and acc:
        seg_pred = seg_pred.contiguous().view(-1, num_part)
        target = target.view(-1, 1)[:, 0]
        # Loss
        loss = F.nll_loss(seg_pred.contiguous(), target.contiguous())

        # accuracy:
        pred_choice = seg_pred.data.max(1)[1]  # b*n
        correct = pred_choice.eq(target.data).sum()  # torch.int64: total number of correct-predict pts

        loss = torch.mean(loss)
        shape_ious += batch_ious.item()  # count the sum of ious in each iteration
        count += batch_size  # count the total number of samples in each iteration
        test_loss += loss.item() * batch_size
        accuracy.append(correct.item() / (batch_size * num_point))  # append the accuracy of each iteration

    for cat_idx in range(16):
        if final_total_per_cat_seen[cat_idx] > 0:  # indicating this cat is included during previous iou appending
            final_total_per_cat_iou[cat_idx] = (
                final_total_per_cat_iou[cat_idx] / final_total_per_cat_seen[cat_idx]
            )  # avg class iou across all samples

    metrics["accuracy"] = np.mean(accuracy)
    metrics["shape_avg_iou"] = shape_ious * 1.0 / count

    outstr = "Test %d, loss: %f, test acc: %f  test ins_iou: %f" % (
        epoch + 1,
        test_loss * 1.0 / count,
        metrics["accuracy"],
        metrics["shape_avg_iou"],
    )

    io.cprint(outstr)

    return metrics, final_total_per_cat_iou


def test(args, io):
    # Dataloader
    test_data = PartNormalDataset(npoints=2048, split="test", normalize=False)
    print("The number of test data is:%d", len(test_data))

    test_loader = DataLoader(
        test_data,
        batch_size=args.test_batch_size,
        shuffle=False,
        num_workers=args.workers,
        drop_last=False,
    )

    # Try to load models
    num_part = 50
    device = torch.device("cuda" if args.cuda else "cpu")

    model = models.__dict__[args.model](num_part).to(device)
    io.cprint(str(model))

    from collections import OrderedDict

    state_dict = torch.load(
        f"checkpoints/{args.exp_name}/best_{args.model_type}_model.pth",
        map_location=torch.device("cpu"),
    )["model"]

    new_state_dict = OrderedDict()
    for layer in state_dict:
        new_state_dict[layer.replace("module.", "")] = state_dict[layer]
    model.load_state_dict(new_state_dict)

    model.eval()
    num_part = 50
    num_classes = 16
    metrics = defaultdict(lambda: list())
    hist_acc = []
    shape_ious = []
    total_per_cat_iou = np.zeros(16).astype(np.float32)
    total_per_cat_seen = np.zeros(16).astype(np.int32)

    for _batch_id, (points, label, target, norm_plt) in tqdm(
        enumerate(test_loader),
        total=len(test_loader),
        smoothing=0.9,
    ):
        batch_size, num_point, _ = points.size()
        points, label, target, norm_plt = (
            Variable(points.float()),
            Variable(label.long()),
            Variable(target.long()),
            Variable(norm_plt.float()),
        )
        points = points.transpose(2, 1)
        norm_plt = norm_plt.transpose(2, 1)
        points, label, target, norm_plt = (
            points.cuda(non_blocking=True),
            label.squeeze().cuda(non_blocking=True),
            target.cuda(non_blocking=True),
            norm_plt.cuda(non_blocking=True),
        )

        with torch.no_grad():
            seg_pred = model(points, norm_plt, to_categorical(label, num_classes))  # b,n,50

        # instance iou without considering the class average at each batch_size:
        batch_shapeious = compute_overall_iou(seg_pred, target, num_part)  # [b]
        shape_ious += batch_shapeious  # iou +=, equals to .append

        # per category iou at each batch_size:
        for shape_idx in range(seg_pred.size(0)):  # sample_idx
            cur_gt_label = label[shape_idx]  # label[sample_idx]
            total_per_cat_iou[cur_gt_label] += batch_shapeious[shape_idx]
            total_per_cat_seen[cur_gt_label] += 1

        # accuracy:
        seg_pred = seg_pred.contiguous().view(-1, num_part)
        target = target.view(-1, 1)[:, 0]
        pred_choice = seg_pred.data.max(1)[1]
        correct = pred_choice.eq(target.data).cpu().sum()
        metrics["accuracy"].append(correct.item() / (batch_size * num_point))

    hist_acc += metrics["accuracy"]
    metrics["accuracy"] = np.mean(hist_acc)
    metrics["shape_avg_iou"] = np.mean(shape_ious)
    for cat_idx in range(16):
        if total_per_cat_seen[cat_idx] > 0:
            total_per_cat_iou[cat_idx] = total_per_cat_iou[cat_idx] / total_per_cat_seen[cat_idx]

    # First we need to calculate the iou of each class and the avg class iou:
    class_iou = 0
    for cat_idx in range(16):
        class_iou += total_per_cat_iou[cat_idx]
        io.cprint(classes_str[cat_idx] + " iou: " + str(total_per_cat_iou[cat_idx]))  # print the iou of each class
    avg_class_iou = class_iou / 16
    outstr = "Test :: test acc: {:f}  test class mIOU: {:f}, test instance mIOU: {:f}".format(
        metrics["accuracy"],
        avg_class_iou,
        metrics["shape_avg_iou"],
    )
    io.cprint(outstr)


if __name__ == "__main__":
    # Training settings
    parser = argparse.ArgumentParser(description="3D Shape Part Segmentation")
    parser.add_argument("--model", type=str, default="PointMLP1")
    parser.add_argument("--exp_name", type=str, default="demo1", metavar="N", help="Name of the experiment")
    parser.add_argument("--batch_size", type=int, default=32, metavar="batch_size", help="Size of batch)")
    parser.add_argument("--test_batch_size", type=int, default=32, metavar="batch_size", help="Size of batch)")
    parser.add_argument("--epochs", type=int, default=350, metavar="N", help="number of episode to train")
    parser.add_argument("--use_sgd", type=bool, default=False, help="Use SGD")
    parser.add_argument("--scheduler", type=str, default="step", help="lr scheduler")
    parser.add_argument("--step", type=int, default=40, help="lr decay step")
    parser.add_argument("--lr", type=float, default=0.003, metavar="LR", help="learning rate")
    parser.add_argument("--momentum", type=float, default=0.9, metavar="M", help="SGD momentum (default: 0.9)")
    parser.add_argument("--no_cuda", type=bool, default=False, help="enables CUDA training")
    parser.add_argument("--manual_seed", type=int, metavar="S", help="random seed (default: 1)")
    parser.add_argument("--eval", type=bool, default=False, help="evaluate the model")
    parser.add_argument("--num_points", type=int, default=2048, help="num of points to use")
    parser.add_argument("--workers", type=int, default=12)
    parser.add_argument("--resume", type=bool, default=False, help="Resume training or not")
    parser.add_argument(
        "--model_type",
        type=str,
        default="insiou",
        help="choose to test the best insiou/clsiou/acc model (options: insiou, clsiou, acc)",
    )

    args = parser.parse_args()
    args.exp_name = args.model + "_" + args.exp_name

    _init_()

    if not args.eval:
        io = IOStream("checkpoints/" + args.exp_name + "/%s_train.log" % (args.exp_name))
    else:
        io = IOStream("checkpoints/" + args.exp_name + "/%s_test.log" % (args.exp_name))
    io.cprint(str(args))

    if args.manual_seed is not None:
        random.seed(args.manual_seed)
        np.random.seed(args.manual_seed)
        torch.manual_seed(args.manual_seed)

    args.cuda = not args.no_cuda and torch.cuda.is_available()

    if args.cuda:
        io.cprint("Using GPU")
        if args.manual_seed is not None:
            torch.cuda.manual_seed(args.manual_seed)
            torch.cuda.manual_seed_all(args.manual_seed)
    else:
        io.cprint("Using CPU")

    if not args.eval:
        train(args, io)
    else:
        test(args, io)