import time

import matplotlib.pyplot as plt
import numpy as np
from rich import print
from rich.console import Console

console = Console()

REWARD_MOVE = -0.04
REWARD_GOAL = 1
REWARD_DEATH = -1
DISCOUNT_FACTOR = 0.9

ARROWS = ["←", "↑", "↓", "→"]

# For each chosen action (row), probability of actually moving in each direction (column).
MOVEMENTS = np.array([
    [0.7, 0.1, 0.1, 0.1],
    [0.1, 0.7, 0.1, 0.1],
    [0.1, 0.1, 0.7, 0.1],
    [0.1, 0.1, 0.1, 0.7],
])

# One reward per state: state 3 is the goal, state 6 is the death trap.
REWARDS = np.array([
    REWARD_MOVE, REWARD_MOVE, REWARD_MOVE, REWARD_GOAL,
    REWARD_MOVE, REWARD_MOVE, REWARD_DEATH, REWARD_MOVE,
    REWARD_MOVE, REWARD_MOVE, REWARD_MOVE,
])

# Part 1: Performance Prediction
console.rule("[bold white]Part 1: Performance Prediction")

# Exo 1: Random policy
console.rule("[bold yellow]Exo 1: Random policy")

# Transition matrix of the policy to evaluate. The terminal states 3 (goal)
# and 6 (death) have all-zero rows, so nothing is collected after reaching them.
P = np.array([
    [0.8, 0.1, 0.0, 0.0, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.7, 0.1, 0.1, 0.0, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.1, 0.0, 0.0, 0.0, 0.8, 0.0, 0.0, 0.1, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.1, 0.0, 0.0, 0.7, 0.1, 0.0, 0.0, 0.1, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.8, 0.1, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.8, 0.1, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.0, 0.0, 0.1, 0.1, 0.1],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.7, 0.2],
])

# Closed-form policy evaluation: V = (I - gamma * P)^-1 @ R
V = np.linalg.inv(np.eye(11) - DISCOUNT_FACTOR * P) @ REWARDS.flatten()
print(V)

# The solution must be a fixed point of the Bellman operator.
V_new = REWARDS + DISCOUNT_FACTOR * P @ V
if np.allclose(V, V_new):
    print("[green]V is a fixed point !")
else:
    print("[red]You suck !")

# Exo 2: Iterative Policy Evaluation
console.rule("[bold yellow]Exo 2: Iterative Policy Evaluation")

DELTA = 1e-6

# Apply the Bellman operator repeatedly from a random start until the update
# is smaller than DELTA.
V_random = np.random.rand(11)
while True:
    V_new = REWARDS + DISCOUNT_FACTOR * P @ V_random
    diff = np.max(np.abs(V_new - V_random))
    V_random = V_new
    if diff < DELTA:
        break
print(V_random)

# Exo 3: Bellman operator contraction
console.rule("[bold yellow]Exo 3: Bellman operator contraction")

# Two arbitrary starting points: since the Bellman operator is a contraction,
# the distance between the two iterates shrinks geometrically.
V1 = np.random.rand(11)
V2 = np.random.rand(11)
diffs = []
while True:
    V1 = REWARDS + DISCOUNT_FACTOR * P @ V1
    V2 = REWARDS + DISCOUNT_FACTOR * P @ V2
    diff = np.max(np.abs(V1 - V2))
    diffs.append(diff)
    if diff < DELTA:
        break
print(V1)
print(V2)

plt.plot(diffs)
plt.yscale("log")
plt.title("Convergence of the Bellman operator")
plt.xlabel("Iteration")
plt.ylabel("Difference")
# plt.show()
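# Optional sanity check (added sketch): the closed-form solution V from Exo 1
# and the iterative estimate V_random from Exo 2 evaluate the same policy, so
# they should agree up to the stopping tolerance DELTA.
if np.allclose(V, V_random, atol=1e-4):
    print("[green]Closed-form and iterative evaluations agree")
else:
    print("[red]Closed-form and iterative evaluations disagree")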
# Part 2: Optimization
console.rule("[bold white]Part 2: Optimization")

# Exo 1: Bellman equation
console.rule("[bold yellow]Exo 1: Bellman equation")
# cf. course, part 2, slide 15

# Exo 2: Value Iteration Algorithm
console.rule("[bold yellow]Exo 2: Value Iteration Algorithm")

# One transition matrix per action, in the same order as ARROWS:
# P_g = ←, P_h = ↑, P_b = ↓, P_d = →.
P_g = np.array([
    [0.8, 0.1, 0.0, 0.0, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.7, 0.1, 0.1, 0.0, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.1, 0.0, 0.0, 0.0, 0.8, 0.0, 0.0, 0.1, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.1, 0.0, 0.0, 0.7, 0.1, 0.0, 0.0, 0.1, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.8, 0.1, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.7, 0.1, 0.1],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.7, 0.2],
])
P_h = np.array([
    [0.8, 0.1, 0.0, 0.0, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.1, 0.8, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.1, 0.7, 0.1, 0.0, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.7, 0.0, 0.0, 0.0, 0.2, 0.0, 0.0, 0.1, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.7, 0.0, 0.0, 0.1, 0.1, 0.0, 0.0, 0.1, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.7, 0.0, 0.0, 0.2, 0.1, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.8, 0.1, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.0, 0.0, 0.1, 0.1, 0.1],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.0, 0.0, 0.1, 0.2],
])
P_b = np.array([
    [0.2, 0.1, 0.0, 0.0, 0.7, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.1, 0.8, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.1, 0.1, 0.1, 0.0, 0.7, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.1, 0.0, 0.0, 0.0, 0.2, 0.0, 0.0, 0.7, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.1, 0.0, 0.0, 0.1, 0.1, 0.0, 0.0, 0.7, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.8, 0.1, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.8, 0.1, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.1, 0.7, 0.1],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.1, 0.8],
])
P_d = np.array([
    [0.2, 0.7, 0.0, 0.0, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.1, 0.2, 0.7, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.1, 0.1, 0.7, 0.0, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.1, 0.0, 0.0, 0.0, 0.8, 0.0, 0.0, 0.1, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.1, 0.0, 0.0, 0.1, 0.7, 0.0, 0.0, 0.1, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.2, 0.7, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.2, 0.7, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.1, 0.1, 0.7],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.1, 0.8],
])

# Value iteration: apply the Bellman optimality operator (max over the four
# actions) until convergence, then read off the greedy policy.
V_optimal = np.random.rand(11)
pi = np.zeros(11)
while True:
    V_g = REWARDS + DISCOUNT_FACTOR * P_g @ V_optimal
    V_h = REWARDS + DISCOUNT_FACTOR * P_h @ V_optimal
    V_b = REWARDS + DISCOUNT_FACTOR * P_b @ V_optimal
    V_d = REWARDS + DISCOUNT_FACTOR * P_d @ V_optimal
    V_new = np.max([V_g, V_h, V_b, V_d], axis=0)
    if np.allclose(V_new, V_optimal, atol=1e-6):
        pi = np.argmax([V_g, V_h, V_b, V_d], axis=0)
        break
    else:
        V_optimal = V_new
print(V_optimal)

# Display the policy on the 3x4 grid: insert the blocked cell, then mark the
# goal and death states.
pi_pretty = [ARROWS[i] for i in pi]
pi_pretty.insert(5, "■")
pi_pretty[3] = "✓"
pi_pretty[7] = "☠"
pi_pretty = np.array(pi_pretty).reshape(3, 4)
print(pi_pretty)

# Exo 4: Performance comparison
console.rule("[bold yellow]Exo 4: Performance comparison")

# Gap between the optimal value function and the value of the policy
# evaluated in Part 1.
perf = np.abs(V_optimal - V_random)
print(perf)
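# Optional sanity check (added sketch): the optimal value function should
# dominate the value of the Part 1 policy in every state, so the absolute gap
# printed above should simply be V_optimal - V_random, up to the numerical
# tolerance of both iterative schemes.
if np.all(V_optimal >= V_random - 1e-4):
    print("[green]V_optimal dominates V_random in every state")
else:
    print("[red]V_optimal should not be below V_random")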