"""Tabular MDP exercises: policy evaluation and value iteration (11 states)."""

import time

import matplotlib.pyplot as plt
import numpy as np
from rich import print
from rich.console import Console

# Shared rich console; the script uses it to draw the section-title rules.
console = Console()

# Reward values used to fill the per-state REWARDS vector below.
REWARD_MOVE = -0.04
REWARD_GOAL = 1
REWARD_DEATH = -1
# Weight applied to the next-state value in every Bellman backup of the script.
DISCOUNT_FACTOR = 0.9

# Action glyphs, indexed by the argmax over the per-action values
# (same order as P_g, P_h, P_b, P_d in the value-iteration step).
ARROWS = ["←", "↑", "↓", "→"]

# 4x4 table with 0.7 on the diagonal and 0.1 elsewhere (every row sums to 1).
# NOTE(review): not referenced anywhere in the visible script; presumably
# P(actual move | intended move) -- confirm before relying on it.
MOVEMENTS = np.array([
    [0.7, 0.1, 0.1, 0.1],
    [0.1, 0.7, 0.1, 0.1],
    [0.1, 0.1, 0.7, 0.1],
    [0.1, 0.1, 0.1, 0.7],
])

# One reward per state (11 states, written here as groups of 4 / 3 / 4).
# State 3 carries REWARD_GOAL and state 6 carries REWARD_DEATH.
REWARDS = np.array([
    REWARD_MOVE, REWARD_MOVE, REWARD_MOVE, REWARD_GOAL,
    REWARD_MOVE, REWARD_MOVE, REWARD_DEATH,
    REWARD_MOVE, REWARD_MOVE, REWARD_MOVE, REWARD_MOVE,
])

# Part 1: Performance Prediction
console.rule("[bold white]Part 1: Performance Prediction")

# Exo 1: Random policy
console.rule("[bold yellow]Exo 1: Random policy")

# Transition matrix of the policy evaluated in Part 1: P[s, s'] is the
# probability of landing in state s' from state s, so `P @ V` is the expected
# next-state value. Columns are grouped 4 / 3 / 4 like REWARDS.
# Rows 3 and 6 (the REWARD_GOAL / REWARD_DEATH states) are all zero, so nothing
# is propagated out of them; every other row sums to 1.
# NOTE(review): the banner calls this a "random policy", but each row equals a
# row of one of the per-action matrices (P_g/P_h/P_b/P_d) rather than their
# average -- confirm which policy was intended.
P = np.array([
    [0.8, 0.1, 0.0, 0.0,  0.1, 0.0, 0.0,  0.0, 0.0, 0.0, 0.0],  # state 0
    [0.7, 0.2, 0.1, 0.0,  0.0, 0.0, 0.0,  0.0, 0.0, 0.0, 0.0],  # state 1
    [0.0, 0.7, 0.1, 0.1,  0.0, 0.1, 0.0,  0.0, 0.0, 0.0, 0.0],  # state 2
    [0.0, 0.0, 0.0, 0.0,  0.0, 0.0, 0.0,  0.0, 0.0, 0.0, 0.0],  # state 3
    [0.1, 0.0, 0.0, 0.0,  0.8, 0.0, 0.0,  0.1, 0.0, 0.0, 0.0],  # state 4
    [0.0, 0.0, 0.1, 0.0,  0.0, 0.7, 0.1,  0.0, 0.0, 0.1, 0.0],  # state 5
    [0.0, 0.0, 0.0, 0.0,  0.0, 0.0, 0.0,  0.0, 0.0, 0.0, 0.0],  # state 6
    [0.0, 0.0, 0.0, 0.0,  0.1, 0.0, 0.0,  0.8, 0.1, 0.0, 0.0],  # state 7
    [0.0, 0.0, 0.0, 0.0,  0.0, 0.0, 0.0,  0.1, 0.8, 0.1, 0.0],  # state 8
    [0.0, 0.0, 0.0, 0.0,  0.0, 0.7, 0.0,  0.0, 0.1, 0.1, 0.1],  # state 9
    [0.0, 0.0, 0.0, 0.0,  0.0, 0.0, 0.1,  0.0, 0.0, 0.7, 0.2],  # state 10
])

# Closed-form policy evaluation: V satisfies (I - gamma * P) V = R.
# Solving the linear system directly avoids forming the explicit inverse.
V = np.linalg.solve(np.eye(11) - DISCOUNT_FACTOR * P, REWARDS)
print(V)

# Apply one Bellman backup to V; an exact solution is left unchanged by it.
V_new = REWARDS + DISCOUNT_FACTOR * P @ V

if np.allclose(V, V_new):
    print("[green]V is a fixed point !")
else:
    print("[red]You suck !")

# Exo 2: Iterative Policy Evaluation
console.rule("[bold yellow]Exo 2: Iterative Policy Evaluation")

# Convergence threshold on the largest per-state change between two sweeps.
DELTA = 1e-6

# Start from random values in [0, 1); the name refers to this random start.
V_random = np.random.rand(11)

while True:
    # One Bellman backup under the fixed policy matrix P.
    V_new = REWARDS + DISCOUNT_FACTOR * P @ V_random

    # Sup-norm distance between consecutive iterates.
    diff = np.max(np.abs(V_new - V_random))
    V_random = V_new

    if diff < DELTA:
        break

print(V_random)

# Exo 3: Bellman operator contraction
console.rule("[bold yellow]Exo 3: Bellman operator contraction")

# Two independent random starting points pushed through the same operator.
V1 = np.random.rand(11)
V2 = np.random.rand(11)
diffs = []  # sup-norm gap between V1 and V2 after each backup

while True:
    V1 = REWARDS + DISCOUNT_FACTOR * P @ V1
    V2 = REWARDS + DISCOUNT_FACTOR * P @ V2

    diff = np.max(np.abs(V1 - V2))
    diffs.append(diff)

    # Stops when the two sequences agree with each other to within DELTA;
    # this does not test either sequence against its own previous iterate.
    if diff < DELTA:
        break

print(V1)
print(V2)

# Log-scale plot of the gap per iteration.
plt.plot(diffs)
plt.yscale("log")
plt.title("Convergence of Bellman operator")
plt.xlabel("Iteration")
plt.ylabel("Difference")
# plt.show()

# Part 2: Optimization
console.rule("[bold white]Part 2: Optimization")

# Exo 1: Bellman equation
console.rule("[bold yellow]Exo 1: Bellmann equation")
# See the course material, part 2, slide 15.

# Exo 2: Value Iteration Algorithm
console.rule("[bold yellow]Exo 2: Value Iteration Algorithm")

# Transition matrix for the first action: its values are stacked first in the
# value-iteration step, so it pairs with ARROWS[0] ("←").
# P_g[s, s'] is the probability of landing in s' from s; columns are grouped
# 4 / 3 / 4 like REWARDS. Rows 3 and 6 are all zero; other rows sum to 1.
P_g = np.array([
    [0.8, 0.1, 0.0, 0.0,  0.1, 0.0, 0.0,  0.0, 0.0, 0.0, 0.0],  # state 0
    [0.7, 0.2, 0.1, 0.0,  0.0, 0.0, 0.0,  0.0, 0.0, 0.0, 0.0],  # state 1
    [0.0, 0.7, 0.1, 0.1,  0.0, 0.1, 0.0,  0.0, 0.0, 0.0, 0.0],  # state 2
    [0.0, 0.0, 0.0, 0.0,  0.0, 0.0, 0.0,  0.0, 0.0, 0.0, 0.0],  # state 3
    [0.1, 0.0, 0.0, 0.0,  0.8, 0.0, 0.0,  0.1, 0.0, 0.0, 0.0],  # state 4
    [0.0, 0.0, 0.1, 0.0,  0.0, 0.7, 0.1,  0.0, 0.0, 0.1, 0.0],  # state 5
    [0.0, 0.0, 0.0, 0.0,  0.0, 0.0, 0.0,  0.0, 0.0, 0.0, 0.0],  # state 6
    [0.0, 0.0, 0.0, 0.0,  0.1, 0.0, 0.0,  0.8, 0.1, 0.0, 0.0],  # state 7
    [0.0, 0.0, 0.0, 0.0,  0.0, 0.0, 0.0,  0.7, 0.2, 0.1, 0.0],  # state 8
    [0.0, 0.0, 0.0, 0.0,  0.0, 0.1, 0.0,  0.0, 0.7, 0.1, 0.1],  # state 9
    [0.0, 0.0, 0.0, 0.0,  0.0, 0.0, 0.1,  0.0, 0.0, 0.7, 0.2],  # state 10
])

# Transition matrix for the second action: its values are stacked second in
# the value-iteration step, so it pairs with ARROWS[1] ("↑").
# P_h[s, s'] is the probability of landing in s' from s; columns are grouped
# 4 / 3 / 4 like REWARDS. Rows 3 and 6 are all zero; other rows sum to 1.
P_h = np.array([
    [0.8, 0.1, 0.0, 0.0,  0.1, 0.0, 0.0,  0.0, 0.0, 0.0, 0.0],  # state 0
    [0.1, 0.8, 0.1, 0.0,  0.0, 0.0, 0.0,  0.0, 0.0, 0.0, 0.0],  # state 1
    [0.0, 0.1, 0.7, 0.1,  0.0, 0.1, 0.0,  0.0, 0.0, 0.0, 0.0],  # state 2
    [0.0, 0.0, 0.0, 0.0,  0.0, 0.0, 0.0,  0.0, 0.0, 0.0, 0.0],  # state 3
    [0.7, 0.0, 0.0, 0.0,  0.2, 0.0, 0.0,  0.1, 0.0, 0.0, 0.0],  # state 4
    [0.0, 0.0, 0.7, 0.0,  0.0, 0.1, 0.1,  0.0, 0.0, 0.1, 0.0],  # state 5
    [0.0, 0.0, 0.0, 0.0,  0.0, 0.0, 0.0,  0.0, 0.0, 0.0, 0.0],  # state 6
    [0.0, 0.0, 0.0, 0.0,  0.7, 0.0, 0.0,  0.2, 0.1, 0.0, 0.0],  # state 7
    [0.0, 0.0, 0.0, 0.0,  0.0, 0.0, 0.0,  0.1, 0.8, 0.1, 0.0],  # state 8
    [0.0, 0.0, 0.0, 0.0,  0.0, 0.7, 0.0,  0.0, 0.1, 0.1, 0.1],  # state 9
    [0.0, 0.0, 0.0, 0.0,  0.0, 0.0, 0.7,  0.0, 0.0, 0.1, 0.2],  # state 10
])

# Transition matrix for the third action: its values are stacked third in the
# value-iteration step, so it pairs with ARROWS[2] ("↓").
# P_b[s, s'] is the probability of landing in s' from s; columns are grouped
# 4 / 3 / 4 like REWARDS. Rows 3 and 6 are all zero; other rows sum to 1.
P_b = np.array([
    [0.2, 0.1, 0.0, 0.0,  0.7, 0.0, 0.0,  0.0, 0.0, 0.0, 0.0],  # state 0
    [0.1, 0.8, 0.1, 0.0,  0.0, 0.0, 0.0,  0.0, 0.0, 0.0, 0.0],  # state 1
    [0.0, 0.1, 0.1, 0.1,  0.0, 0.7, 0.0,  0.0, 0.0, 0.0, 0.0],  # state 2
    [0.0, 0.0, 0.0, 0.0,  0.0, 0.0, 0.0,  0.0, 0.0, 0.0, 0.0],  # state 3
    [0.1, 0.0, 0.0, 0.0,  0.2, 0.0, 0.0,  0.7, 0.0, 0.0, 0.0],  # state 4
    [0.0, 0.0, 0.1, 0.0,  0.0, 0.1, 0.1,  0.0, 0.0, 0.7, 0.0],  # state 5
    [0.0, 0.0, 0.0, 0.0,  0.0, 0.0, 0.0,  0.0, 0.0, 0.0, 0.0],  # state 6
    [0.0, 0.0, 0.0, 0.0,  0.1, 0.0, 0.0,  0.8, 0.1, 0.0, 0.0],  # state 7
    [0.0, 0.0, 0.0, 0.0,  0.0, 0.0, 0.0,  0.1, 0.8, 0.1, 0.0],  # state 8
    [0.0, 0.0, 0.0, 0.0,  0.0, 0.1, 0.0,  0.0, 0.1, 0.7, 0.1],  # state 9
    [0.0, 0.0, 0.0, 0.0,  0.0, 0.0, 0.1,  0.0, 0.0, 0.1, 0.8],  # state 10
])

# Transition matrix for the fourth action: its values are stacked last in the
# value-iteration step, so it pairs with ARROWS[3] ("→").
# P_d[s, s'] is the probability of landing in s' from s; columns are grouped
# 4 / 3 / 4 like REWARDS. Rows 3 and 6 are all zero; other rows sum to 1.
P_d = np.array([
    [0.2, 0.7, 0.0, 0.0,  0.1, 0.0, 0.0,  0.0, 0.0, 0.0, 0.0],  # state 0
    [0.1, 0.2, 0.7, 0.0,  0.0, 0.0, 0.0,  0.0, 0.0, 0.0, 0.0],  # state 1
    [0.0, 0.1, 0.1, 0.7,  0.0, 0.1, 0.0,  0.0, 0.0, 0.0, 0.0],  # state 2
    [0.0, 0.0, 0.0, 0.0,  0.0, 0.0, 0.0,  0.0, 0.0, 0.0, 0.0],  # state 3
    [0.1, 0.0, 0.0, 0.0,  0.8, 0.0, 0.0,  0.1, 0.0, 0.0, 0.0],  # state 4
    [0.0, 0.0, 0.1, 0.0,  0.0, 0.1, 0.7,  0.0, 0.0, 0.1, 0.0],  # state 5
    [0.0, 0.0, 0.0, 0.0,  0.0, 0.0, 0.0,  0.0, 0.0, 0.0, 0.0],  # state 6
    [0.0, 0.0, 0.0, 0.0,  0.1, 0.0, 0.0,  0.2, 0.7, 0.0, 0.0],  # state 7
    [0.0, 0.0, 0.0, 0.0,  0.0, 0.0, 0.0,  0.1, 0.2, 0.7, 0.0],  # state 8
    [0.0, 0.0, 0.0, 0.0,  0.0, 0.1, 0.0,  0.0, 0.1, 0.1, 0.7],  # state 9
    [0.0, 0.0, 0.0, 0.0,  0.0, 0.0, 0.1,  0.0, 0.0, 0.1, 0.8],  # state 10
])

# Value iteration: repeatedly replace V by the per-state maximum, over the four
# actions, of the one-step backup R + gamma * P_a @ V.
V_optimal = np.random.rand(11)
# Greedy action index per state (indexes ARROWS); filled in on convergence.
pi = np.zeros(11, dtype=int)

while True:
    # One backup per action, in the same order as ARROWS.
    V_g, V_h, V_b, V_d = (
        REWARDS + DISCOUNT_FACTOR * P_a @ V_optimal
        for P_a in (P_g, P_h, P_b, P_d)
    )
    action_values = np.array([V_g, V_h, V_b, V_d])  # shape (4, 11)

    V_new = action_values.max(axis=0)

    # Compare against the previous iterate before overwriting it.
    converged = np.allclose(V_new, V_optimal, atol=1e-6)
    # Always keep the latest backup so the final sweep is not thrown away.
    V_optimal = V_new

    if converged:
        pi = action_values.argmax(axis=0)
        break

print(V_optimal)

# Render the greedy policy as a 3x4 grid of glyphs.
pi_pretty = [ARROWS[i] for i in pi]
# Pad the 11 per-state glyphs to 12 cells with a filler at flat index 5 so the
# list can be reshaped to 3x4.
pi_pretty.insert(5, "■")
# Overwrite state 3 (REWARD_GOAL) and, at index 7 after the insertion,
# state 6 (REWARD_DEATH) with markers instead of arrows.
pi_pretty[3] = "✓"
pi_pretty[7] = "☠"
pi_pretty = np.array(pi_pretty).reshape(3, 4)
print(pi_pretty)

# Exo 4: Performance comparison
console.rule("[bold yellow]Exo 4: Performance comparison")

# Per-state absolute gap between the value-iteration result and the values
# obtained for the fixed policy P in Part 1, Exo 2.
perf = np.abs(V_optimal - V_random)
print(perf)