TP-reinforcement-learning/TP1/main.py

import time
import matplotlib.pyplot as plt
import numpy as np
from rich import print
from rich.console import Console
console = Console()
REWARD_MOVE = -0.04
REWARD_GOAL = 1
REWARD_DEATH = -1
DISCOUNT_FACTOR = 0.9
ARROWS = ["←", "↑", "↓", "→"]  # one symbol per action: left, up, down, right (order matches [V_g, V_h, V_b, V_d] below)
MOVEMENTS = np.array([
[0.7, 0.1, 0.1, 0.1],
[0.1, 0.7, 0.1, 0.1],
[0.1, 0.1, 0.7, 0.1],
[0.1, 0.1, 0.1, 0.7]
])
REWARDS = np.array([
REWARD_MOVE, REWARD_MOVE, REWARD_MOVE, REWARD_GOAL,
REWARD_MOVE, REWARD_MOVE, REWARD_DEATH,
REWARD_MOVE, REWARD_MOVE, REWARD_MOVE, REWARD_MOVE,
])
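# Grid layout implied by REWARDS and the display code at the end of the file:
# a 3x4 grid with one wall cell, hence 11 reachable states. Row 0 holds states
# 0-3 (goal in state 3), row 1 holds states 4-6 (wall next to state 4, trap in
# state 6), row 2 holds states 7-10.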
# Part 1: Performance Prediction
console.rule("[bold white]Part 1: Performance Prediction")
# Exo 1: Random policy
console.rule("[bold yellow]Exo 1: Random policy")
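# P[s, s'] = transition probability from state s to s' under the uniform random
# policy; each row is written as blocks of 4 / 3 / 4 values matching the grid rows.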
P = np.array([
[
0.8, 0.1, 0.0, 0.0,
0.1, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0,
],
[
0.7, 0.2, 0.1, 0.0,
0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0,
],
[
0.0, 0.7, 0.1, 0.1,
0.0, 0.1, 0.0,
0.0, 0.0, 0.0, 0.0,
],
[
0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0,
],
[
0.1, 0.0, 0.0, 0.0,
0.8, 0.0, 0.0,
0.1, 0.0, 0.0, 0.0,
],
[
0.0, 0.0, 0.1, 0.0,
0.0, 0.7, 0.1,
0.0, 0.0, 0.1, 0.0,
],
[
0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0,
],
[
0.0, 0.0, 0.0, 0.0,
0.1, 0.0, 0.0,
0.8, 0.1, 0.0, 0.0,
],
[
0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0,
0.1, 0.8, 0.1, 0.0,
],
[
0.0, 0.0, 0.0, 0.0,
0.0, 0.7, 0.0,
0.0, 0.1, 0.1, 0.1,
],
[
0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.1,
0.0, 0.0, 0.7, 0.2,
],
])
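# Closed-form policy evaluation: the Bellman equation V = R + gamma * P @ V is
# linear, so V = (I - gamma * P)^{-1} @ R.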
V = np.linalg.inv(np.eye(11) - DISCOUNT_FACTOR * P) @ REWARDS
print(V)
V_new = REWARDS + DISCOUNT_FACTOR * P @ V
if np.allclose(V, V_new):
    print("[green]V is a fixed point !")
else:
    print("[red]You suck !")
# Exo 2: Iterative Policy Evaluation
console.rule("[bold yellow]Exo 2: Iterative Policy Evaluation")
DELTA = 1e-6
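# Iterative policy evaluation: apply the Bellman operator T(V) = R + gamma * P @ V
# from an arbitrary starting point until the sup-norm update drops below DELTA.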
V_random = np.random.rand(11)
while True:
    V_new = REWARDS + DISCOUNT_FACTOR * P @ V_random
    diff = np.max(np.abs(V_new - V_random))
    V_random = V_new
    if diff < DELTA:
        break
print(V_random)
# Exo 3: Bellman operator contraction
console.rule("[bold yellow]Exo 3: Bellman operator contraction")
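# T is a gamma-contraction for the sup norm, so two value functions iterated from
# different random starts satisfy ||T(V1) - T(V2)||_inf <= gamma * ||V1 - V2||_inf
# and the gap plotted below should shrink geometrically.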
V1 = np.random.rand(11)
V2 = np.random.rand(11)
diffs = []
while True:
    V1 = REWARDS + DISCOUNT_FACTOR * P @ V1
    V2 = REWARDS + DISCOUNT_FACTOR * P @ V2
    diff = np.max(np.abs(V1 - V2))
    diffs.append(diff)
    if diff < DELTA:
        break
print(V1)
print(V2)
plt.plot(diffs)
plt.yscale("log")
plt.title("Convergence of Bellman operator")
plt.xlabel("Iteration")
plt.ylabel("Difference")
# plt.show()
# Part 2: Optimization
console.rule("[bold white]Part 2: Optimization")
# Exo 1: Bellman equation
console.rule("[bold yellow]Exo 1: Bellman equation")
# See lecture notes, part 2, slide 15.
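# Bellman optimality equation (state-based rewards, as used below):
#   V*(s) = max_a [ R(s) + gamma * sum_{s'} P_a(s, s') * V*(s') ]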
# Exo 2: Value Iteration Algorithm
console.rule("[bold yellow]Exo 2: Value Iteration Algorithm")
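# One transition matrix per action: P_g (gauche/left), P_h (haut/up), P_b
# (bas/down), P_d (droite/right). The entries are consistent with the noise
# model of MOVEMENTS: probability 0.7 for the chosen direction, 0.1 for each
# other direction, with blocked moves leaving the agent in place.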
P_g = np.array([
[
0.8, 0.1, 0.0, 0.0,
0.1, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0,
],
[
0.7, 0.2, 0.1, 0.0,
0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0,
],
[
0.0, 0.7, 0.1, 0.1,
0.0, 0.1, 0.0,
0.0, 0.0, 0.0, 0.0,
],
[
0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0,
],
[
0.1, 0.0, 0.0, 0.0,
0.8, 0.0, 0.0,
0.1, 0.0, 0.0, 0.0,
],
[
0.0, 0.0, 0.1, 0.0,
0.0, 0.7, 0.1,
0.0, 0.0, 0.1, 0.0,
],
[
0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0,
],
[
0.0, 0.0, 0.0, 0.0,
0.1, 0.0, 0.0,
0.8, 0.1, 0.0, 0.0,
],
[
0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0,
0.7, 0.2, 0.1, 0.0,
],
[
0.0, 0.0, 0.0, 0.0,
0.0, 0.1, 0.0,
0.0, 0.7, 0.1, 0.1,
],
[
0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.1,
0.0, 0.0, 0.7, 0.2,
],
])
P_h = np.array([
[
0.8, 0.1, 0.0, 0.0,
0.1, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0,
],
[
0.1, 0.8, 0.1, 0.0,
0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0,
],
[
0.0, 0.1, 0.7, 0.1,
0.0, 0.1, 0.0,
0.0, 0.0, 0.0, 0.0,
],
[
0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0,
],
[
0.7, 0.0, 0.0, 0.0,
0.2, 0.0, 0.0,
0.1, 0.0, 0.0, 0.0,
],
[
0.0, 0.0, 0.7, 0.0,
0.0, 0.1, 0.1,
0.0, 0.0, 0.1, 0.0,
],
[
0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0,
],
[
0.0, 0.0, 0.0, 0.0,
0.7, 0.0, 0.0,
0.2, 0.1, 0.0, 0.0,
],
[
0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0,
0.1, 0.8, 0.1, 0.0,
],
[
0.0, 0.0, 0.0, 0.0,
0.0, 0.7, 0.0,
0.0, 0.1, 0.1, 0.1,
],
[
0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.7,
0.0, 0.0, 0.1, 0.2,
],
])
P_b = np.array([
[
0.2, 0.1, 0.0, 0.0,
0.7, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0,
],
[
0.1, 0.8, 0.1, 0.0,
0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0,
],
[
0.0, 0.1, 0.1, 0.1,
0.0, 0.7, 0.0,
0.0, 0.0, 0.0, 0.0,
],
[
0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0,
],
[
0.1, 0.0, 0.0, 0.0,
0.2, 0.0, 0.0,
0.7, 0.0, 0.0, 0.0,
],
[
0.0, 0.0, 0.1, 0.0,
0.0, 0.1, 0.1,
0.0, 0.0, 0.7, 0.0,
],
[
0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0,
],
[
0.0, 0.0, 0.0, 0.0,
0.1, 0.0, 0.0,
0.8, 0.1, 0.0, 0.0,
],
[
0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0,
0.1, 0.8, 0.1, 0.0,
],
[
0.0, 0.0, 0.0, 0.0,
0.0, 0.1, 0.0,
0.0, 0.1, 0.7, 0.1,
],
[
0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.1,
0.0, 0.0, 0.1, 0.8,
],
])
P_d = np.array([
[
0.2, 0.7, 0.0, 0.0,
0.1, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0,
],
[
0.1, 0.2, 0.7, 0.0,
0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0,
],
[
0.0, 0.1, 0.1, 0.7,
0.0, 0.1, 0.0,
0.0, 0.0, 0.0, 0.0,
],
[
0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0,
],
[
0.1, 0.0, 0.0, 0.0,
0.8, 0.0, 0.0,
0.1, 0.0, 0.0, 0.0,
],
[
0.0, 0.0, 0.1, 0.0,
0.0, 0.1, 0.7,
0.0, 0.0, 0.1, 0.0,
],
[
0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0,
],
[
0.0, 0.0, 0.0, 0.0,
0.1, 0.0, 0.0,
0.2, 0.7, 0.0, 0.0,
],
[
0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0,
0.1, 0.2, 0.7, 0.0,
],
[
0.0, 0.0, 0.0, 0.0,
0.0, 0.1, 0.0,
0.0, 0.1, 0.1, 0.7,
],
[
0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.1,
0.0, 0.0, 0.1, 0.8,
],
])
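# Value iteration: repeatedly apply the optimal Bellman backup
#   V_{k+1}(s) = max_a [ R(s) + gamma * (P_a @ V_k)(s) ]
# then read the greedy policy off the argmax once the values have converged.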
V_optimal = np.random.rand(11)
pi = np.zeros(11)
while True:
    V_g = REWARDS + DISCOUNT_FACTOR * P_g @ V_optimal
    V_h = REWARDS + DISCOUNT_FACTOR * P_h @ V_optimal
    V_b = REWARDS + DISCOUNT_FACTOR * P_b @ V_optimal
    V_d = REWARDS + DISCOUNT_FACTOR * P_d @ V_optimal
    V_new = np.max([V_g, V_h, V_b, V_d], axis=0)
    if np.allclose(V_new, V_optimal, atol=1e-6):
        pi = np.argmax([V_g, V_h, V_b, V_d], axis=0)
        break
    else:
        V_optimal = V_new
print(V_optimal)
pi_pretty = [ARROWS[i] for i in pi]
pi_pretty.insert(5, "#")  # wall cell (not a reachable state)
pi_pretty[3] = "G"        # goal state
pi_pretty[7] = "X"        # trap state
pi_pretty = np.array(pi_pretty).reshape(3, 4)
print(pi_pretty)
# Exo 4: Performance comparison
console.rule("[bold yellow]Exo 4: Performance comparison")
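# State-by-state gap between the optimal values (value iteration) and the values
# of the random policy evaluated in Part 1.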
perf = np.abs(V_optimal - V_random)
print(perf)