import time

import matplotlib.pyplot as plt
import numpy as np
from rich import print
from rich.console import Console

console = Console()

REWARD_MOVE = -0.04
REWARD_GOAL = 1
REWARD_DEATH = -1
DISCOUNT_FACTOR = 0.9

ARROWS = ["←", "↑", "↓", "→"]

# For each chosen action (row), probability of actually moving in each direction (column).
MOVEMENTS = np.array([
    [0.7, 0.1, 0.1, 0.1],
    [0.1, 0.7, 0.1, 0.1],
    [0.1, 0.1, 0.7, 0.1],
    [0.1, 0.1, 0.1, 0.7],
])

# One reward per state: state 3 is the goal, state 6 is the death trap.
REWARDS = np.array([
    REWARD_MOVE, REWARD_MOVE, REWARD_MOVE, REWARD_GOAL,
    REWARD_MOVE, REWARD_MOVE, REWARD_DEATH, REWARD_MOVE,
    REWARD_MOVE, REWARD_MOVE, REWARD_MOVE,
])

# Part 1: Performance Prediction
console.rule("[bold white]Part 1: Performance Prediction")

# Exo 1: Random policy
console.rule("[bold yellow]Exo 1: Random policy")

# Transition matrix of the policy to evaluate. The terminal states 3 (goal)
# and 6 (death) have all-zero rows, so nothing is collected after reaching them.
P = np.array([
    [0.8, 0.1, 0.0, 0.0, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.7, 0.1, 0.1, 0.0, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.1, 0.0, 0.0, 0.0, 0.8, 0.0, 0.0, 0.1, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.1, 0.0, 0.0, 0.7, 0.1, 0.0, 0.0, 0.1, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.8, 0.1, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.8, 0.1, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.0, 0.0, 0.1, 0.1, 0.1],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.7, 0.2],
])

# Closed-form policy evaluation: V = (I - gamma * P)^-1 @ R
V = np.linalg.inv(np.eye(11) - DISCOUNT_FACTOR * P) @ REWARDS.flatten()
print(V)

# The solution must be a fixed point of the Bellman operator.
V_new = REWARDS + DISCOUNT_FACTOR * P @ V
if np.allclose(V, V_new):
    print("[green]V is a fixed point !")
else:
    print("[red]You suck !")

# Exo 2: Iterative Policy Evaluation
console.rule("[bold yellow]Exo 2: Iterative Policy Evaluation")

DELTA = 1e-6

# Apply the Bellman operator repeatedly from a random start until the update
# is smaller than DELTA.
V_random = np.random.rand(11)
while True:
    V_new = REWARDS + DISCOUNT_FACTOR * P @ V_random
    diff = np.max(np.abs(V_new - V_random))
    V_random = V_new
    if diff < DELTA:
        break
print(V_random)

# Exo 3: Bellman operator contraction
console.rule("[bold yellow]Exo 3: Bellman operator contraction")

# Two arbitrary starting points: since the Bellman operator is a contraction,
# the distance between the two iterates shrinks geometrically.
V1 = np.random.rand(11)
V2 = np.random.rand(11)
diffs = []
while True:
    V1 = REWARDS + DISCOUNT_FACTOR * P @ V1
    V2 = REWARDS + DISCOUNT_FACTOR * P @ V2
    diff = np.max(np.abs(V1 - V2))
    diffs.append(diff)
    if diff < DELTA:
        break
print(V1)
print(V2)

plt.plot(diffs)
plt.yscale("log")
plt.title("Convergence of the Bellman operator")
plt.xlabel("Iteration")
plt.ylabel("Difference")
# plt.show()
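# Optional sanity check (added sketch): the closed-form solution V from Exo 1
# and the iterative estimate V_random from Exo 2 evaluate the same policy, so
# they should agree up to the stopping tolerance DELTA.
if np.allclose(V, V_random, atol=1e-4):
    print("[green]Closed-form and iterative evaluations agree")
else:
    print("[red]Closed-form and iterative evaluations disagree")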
# Part 2: Optimization
console.rule("[bold white]Part 2: Optimization")

# Exo 1: Bellman equation
console.rule("[bold yellow]Exo 1: Bellman equation")
# cf. course, part 2, slide 15

# Exo 2: Value Iteration Algorithm
console.rule("[bold yellow]Exo 2: Value Iteration Algorithm")

# One transition matrix per action, in the same order as ARROWS:
# P_g = ←, P_h = ↑, P_b = ↓, P_d = →.
P_g = np.array([
    [0.8, 0.1, 0.0, 0.0, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.7, 0.1, 0.1, 0.0, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.1, 0.0, 0.0, 0.0, 0.8, 0.0, 0.0, 0.1, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.1, 0.0, 0.0, 0.7, 0.1, 0.0, 0.0, 0.1, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.8, 0.1, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.7, 0.1, 0.1],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.7, 0.2],
])
P_h = np.array([
    [0.8, 0.1, 0.0, 0.0, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.1, 0.8, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.1, 0.7, 0.1, 0.0, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.7, 0.0, 0.0, 0.0, 0.2, 0.0, 0.0, 0.1, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.7, 0.0, 0.0, 0.1, 0.1, 0.0, 0.0, 0.1, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.7, 0.0, 0.0, 0.2, 0.1, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.8, 0.1, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.0, 0.0, 0.1, 0.1, 0.1],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.0, 0.0, 0.1, 0.2],
])
P_b = np.array([
    [0.2, 0.1, 0.0, 0.0, 0.7, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.1, 0.8, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.1, 0.1, 0.1, 0.0, 0.7, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.1, 0.0, 0.0, 0.0, 0.2, 0.0, 0.0, 0.7, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.1, 0.0, 0.0, 0.1, 0.1, 0.0, 0.0, 0.7, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.8, 0.1, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.8, 0.1, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.1, 0.7, 0.1],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.1, 0.8],
])
P_d = np.array([
    [0.2, 0.7, 0.0, 0.0, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.1, 0.2, 0.7, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.1, 0.1, 0.7, 0.0, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.1, 0.0, 0.0, 0.0, 0.8, 0.0, 0.0, 0.1, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.1, 0.0, 0.0, 0.1, 0.7, 0.0, 0.0, 0.1, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.2, 0.7, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.2, 0.7, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.1, 0.1, 0.7],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.1, 0.8],
])

# Value iteration: apply the Bellman optimality operator (max over the four
# actions) until convergence, then read off the greedy policy.
V_optimal = np.random.rand(11)
pi = np.zeros(11)
while True:
    V_g = REWARDS + DISCOUNT_FACTOR * P_g @ V_optimal
    V_h = REWARDS + DISCOUNT_FACTOR * P_h @ V_optimal
    V_b = REWARDS + DISCOUNT_FACTOR * P_b @ V_optimal
    V_d = REWARDS + DISCOUNT_FACTOR * P_d @ V_optimal
    V_new = np.max([V_g, V_h, V_b, V_d], axis=0)
    if np.allclose(V_new, V_optimal, atol=1e-6):
        pi = np.argmax([V_g, V_h, V_b, V_d], axis=0)
        break
    else:
        V_optimal = V_new
print(V_optimal)

# Display the policy on the 3x4 grid: insert the blocked cell, then mark the
# goal and death states.
pi_pretty = [ARROWS[i] for i in pi]
pi_pretty.insert(5, "■")
pi_pretty[3] = "✓"
pi_pretty[7] = "☠"
pi_pretty = np.array(pi_pretty).reshape(3, 4)
print(pi_pretty)

# Exo 4: Performance comparison
console.rule("[bold yellow]Exo 4: Performance comparison")

# Gap between the optimal value function and the value of the policy
# evaluated in Part 1.
perf = np.abs(V_optimal - V_random)
print(perf)
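# Optional sanity check (added sketch): the optimal value function should
# dominate the value of the Part 1 policy in every state, so the absolute gap
# printed above should simply be V_optimal - V_random, up to the numerical
# tolerance of both iterative schemes.
if np.all(V_optimal >= V_random - 1e-4):
    print("[green]V_optimal dominates V_random in every state")
else:
    print("[red]V_optimal should not be below V_random")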