### A Pluto.jl notebook ###
# v0.19.36

#> [frontmatter]
#> title = "TP1 - Reinforcement learning"
#> date = "2022-12-14"
#> tags = ["RL"]

using Markdown
using InteractiveUtils

# This Pluto notebook uses @bind for interactivity. When running this notebook outside of Pluto, the following 'mock version' of @bind gives bound variables a default value (instead of an error).
macro bind(def, element)
    quote
        local iv = try Base.loaded_modules[Base.PkgId(Base.UUID("6e696c72-6542-2067-7265-42206c756150"), "AbstractPlutoDingetjes")].Bonds.initial_value catch; b -> missing; end
        local el = $(esc(element))
        global $(esc(def)) = Core.applicable(Base.get, el) ? Base.get(el) : iv(el)
        el
    end
end

# ╔═╡ 02b1e10c-653e-4660-90b5-2eae7f19f1f7
# ╠═╡ show_logs = false
# https://github.com/fonsp/Pluto.jl/wiki/%F0%9F%8E%81-Package-management#advanced-set-up-an-environment-with-pkgactivate
begin
    using Pkg
    Pkg.activate()
end

# ╔═╡ 26fdd17e-f03a-4835-93be-85303fe526d8
begin
    using Plots          # for plotting figures
    using PlutoUI        # for Pluto UI objects
    using LinearAlgebra  # for identity matrices
    using SparseArrays   # for sparse matrices
    using LaTeXStrings   # for LaTeX strings (used in plot labels)
    import Random

    TableOfContents(depth=4)
end

# ╔═╡ 56ac3473-24f4-42d7-84e1-cfce6a70d8d5
html"""
"""

# ╔═╡ ccf4d63e-7ace-11ed-2123-d9dbb62bd308
html"""
TP1 - Reinforcement learning
Laurent Fainsin
2021 - 2022
""" # ╔═╡ 9f2879c1-c22b-4067-ad20-4e4c56cc8d00 begin REWARD_MOVE_slider = @bind REWARD_MOVE Slider(-0.1:0.01:0, default=-0.04, show_value=true) REWARD_GOAL_slider = @bind REWARD_GOAL Slider(0:1:10, default=1, show_value=true) REWARD_DEATH_slider = @bind REWARD_DEATH Slider(-10:1:0, default=-2, show_value=true) DISCOUNT_FACTOR_slider = @bind DISCOUNT_FACTOR Slider(0.9:0.01:0.99, default=0.9, show_value=true) div = html"""
""" div_end = html"""
""" md""" $(div) Hyper-paramètres: REWARD\_MOVE: $(REWARD_MOVE_slider) REWARD\_GOAL: $(REWARD_GOAL_slider) REWARD\_DEATH: $(REWARD_DEATH_slider) DISCOUNT\_FACTOR: $(DISCOUNT_FACTOR_slider) $(div_end) """ end # ╔═╡ 0a30a68a-068e-41fb-92c4-000869ba7dff RANDOM_SEED = 420 # ╔═╡ 07b57746-fba0-49aa-ba17-6dcb0bbe44e5 MAX_ITERATIONS = 350 # ╔═╡ 92d6874b-651c-4551-840e-ad5d1e934aeb MOVEMENTS = [ 0.7 0.1 0.1 0.1 0.1 0.7 0.1 0.1 0.1 0.1 0.7 0.1 0.1 0.1 0.1 0.7 ] # ╔═╡ fe44d7f2-155e-42f2-83c3-dd18aadb3810 md""" On définit notre environnement comme une grille 3x4: """ # ╔═╡ 28b769a6-dd3c-43ab-bae0-646d8ebc35d6 begin ARROW_SYMBOLS = ["⬅️", "⬆️", "⬇️", "➡️"] DEATH_SYMBOL = "☠️" SUCCESS_SYMBOL = "🏆" WALL_SYMBOL = "🧱" EMPTY_SYMBOL = "🟫" [ EMPTY_SYMBOL EMPTY_SYMBOL EMPTY_SYMBOL SUCCESS_SYMBOL EMPTY_SYMBOL WALL_SYMBOL EMPTY_SYMBOL DEATH_SYMBOL EMPTY_SYMBOL EMPTY_SYMBOL EMPTY_SYMBOL EMPTY_SYMBOL ] end # ╔═╡ 3881603c-619b-4976-ac4c-2c7e7f3a6ec7 md""" On peut définir nos rewards tels que: """ # ╔═╡ fb797a9b-6a0a-4a77-a9b6-6804f98639bb begin REWARDS = [ REWARD_MOVE, REWARD_MOVE, REWARD_MOVE, REWARD_GOAL, REWARD_MOVE, REWARD_MOVE, REWARD_DEATH, REWARD_MOVE, REWARD_MOVE, REWARD_MOVE, REWARD_MOVE, ] local REWARDS_display = copy(REWARDS) insert!(REWARDS_display, 6, 0) REWARDS_display = permutedims(reshape(REWARDS_display, 4, 3)) REWARDS_display = sparse(REWARDS_display) end # ╔═╡ 1e3abda8-6645-48ba-874d-28e1011fc3e3 md""" # Performance Prediction """ # ╔═╡ beb410a8-03e2-4f18-8ccd-941cc926ee12 md""" ## Question 1 > Assume the random policy, that is, the policy that takes every possible action with probability 1/4. Compute its value function by solving \ > $V = (I − \gamma P )^{-1} R$. \ > Since there are 11 possible states in the problem, the vectors ``R`` and ``V`` have have length 11, and the matrix ``P`` has dimension 11x11. There are two absorbing states, i.e., they are visited once, and their respective reward (+1 or -1) is only accrued once. To model this, you can simply put all 0’s in all the elements of the respective two lines. 
""" # ╔═╡ 133f291f-6f21-4441-86f7-ba190a7d6b1f md""" On définit une politique aléatoire (à la main): """ # ╔═╡ e14f9977-d2fd-4d05-84d6-614008dc0c4a [ ARROW_SYMBOLS[2] ARROW_SYMBOLS[1] ARROW_SYMBOLS[1] SUCCESS_SYMBOL ARROW_SYMBOLS[1] WALL_SYMBOL ARROW_SYMBOLS[1] DEATH_SYMBOL ARROW_SYMBOLS[1] ARROW_SYMBOLS[3] ARROW_SYMBOLS[2] ARROW_SYMBOLS[1] ] # ╔═╡ 486c93ab-9cb9-4df4-b702-bbe12a961647 md""" Via nos probabilités de mouvements on peut alors constituer ``P``: """ # ╔═╡ ab2d705d-fc00-43b2-bb6d-2a3d4ba9dab1 begin P = [ [ 0.8, 0.1, 0.0, 0.0, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ], [ 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ], [ 0.0, 0.7, 0.1, 0.1, 0.0, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, ], [ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ], [ 0.1, 0.0, 0.0, 0.0, 0.8, 0.0, 0.0, 0.1, 0.0, 0.0, 0.0, ], [ 0.0, 0.0, 0.1, 0.0, 0.0, 0.7, 0.1, 0.0, 0.0, 0.1, 0.0, ], [ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ], [ 0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.8, 0.1, 0.0, 0.0, ], [ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.8, 0.1, 0.0, ], [ 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.0, 0.0, 0.1, 0.1, 0.1, ], [ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.7, 0.2, ], ] P = sparse(reduce(hcat, P)') end # ╔═╡ b7ae89c9-3c1b-4f5c-af5b-164d95ccca41 md""" On peut alors calculer ``V``: """ # ╔═╡ 03c17428-5ab9-42e7-bf79-92eb846f11cb begin V = Matrix(I(length(REWARDS)) - DISCOUNT_FACTOR * P) \ REWARDS local V_display = copy(V) insert!(V_display, 6, 0) V_display = permutedims(reshape(V_display, 4, 3)) V_display = sparse(V_display) end # ╔═╡ c65d0dbc-ecd7-4320-9b3a-a1b9c0545f9a md""" ### Bonus On contrôle que ``V`` vérifie l'équation de Bellman en calculant une itération de l'équation de Bellman: $V_{\text{next}} = R + \gamma P V$ et en observant que ``V`` est un point fixe: $V_{\text{next}} = V$ On calcule alors ``V_\text{next}``: """ # ╔═╡ ad547684-bcbe-44f4-9fc1-f327d2db4584 begin V_next = REWARDS + Matrix(DISCOUNT_FACTOR * P) * V local V_display = copy(V_next) insert!(V_display, 6, 0) V_display = permutedims(reshape(V_display, 4, 3)) V_display = sparse(V_display) end # ╔═╡ d3703ab8-912c-417d-acd9-29590ec1134b if isapprox(V_next, V) Markdown.MD(Markdown.Admonition("correct", "V est un point fixe", [md"L'équation de Bellman est vérifiée"])); else Markdown.MD(Markdown.Admonition("danger", "V n'est pas un point fixe", [md"L'équation de Bellman n'est vérifiée"])); end # ╔═╡ 1319b304-5126-4825-8076-e113e4dd3635 md""" ## Question 2 > Evaluate now the policy using Iterative Policy Evaluation (lecture 2, 2nd part, slides 11/35), and verify that the algorithm converges to the result obtained in 1. To stop iterating, you can take as a criterion that the difference between two iterations must be smaller than some small ``\delta``. Due to the contraction principle, the initial vector can be arbitrary. """ # ╔═╡ 3ea3f177-c576-4b9e-a54b-c427e29a8491 md""" On initialise ``V_\text{random} \in [0, 1]^{11}`` aléatoirement. On souhaite vérifier que ``V_\text{random}`` converge vers ``V`` par l'évaluation itérative de la politique ``P``. 
""" # ╔═╡ e94fe8a6-274b-4121-b1fc-063d3710c2f7 begin Random.seed!(RANDOM_SEED) V_random = rand(length(REWARDS)) local diffs = Vector{Float64}() for _ in 1:MAX_ITERATIONS local V_old = V_random global V_random = REWARDS + Matrix(DISCOUNT_FACTOR * P) * V_random append!(diffs, norm(V_random - V_old)) if isapprox(V_random, V_old) break end end plot( diffs, labels = "", xlabel = L"n", ylabel = L"|| V_{n+1} - V_n ||^2", yticks=[10.0^-x for x in 0:10], linewidth=2, yaxis=:log, title="Iterative Policy Evaluation convergence", ) end # ╔═╡ 80090d5f-d56c-4844-a04f-444ed49e5f34 if isapprox(V_random, V, rtol=1e-5) Markdown.MD(Markdown.Admonition("correct", "L'évaluation itérative des politiques est vérifiée", [md"``V_\text{random}`` converge vers ``V``"])); else Markdown.MD(Markdown.Admonition("danger", "L'évaluation itérative des politiques n'est pas vérifiée", [md"``V_\text{random}`` ne converge pas vers ``V``"])); end # ╔═╡ 98362798-aae4-4540-9e98-cc7371802552 md""" ## Question 3 > To verify that the Bellman operator is a contraction, take two initial vectors, and calculate the max of their differences. Then, apply the iterative policy evaluation to these 2 vectors as done in the previous item, and plot the maximum of their differences as you keep iterating. Observe what happens with the difference as you iterate, and explain it. """ # ╔═╡ 30874daf-7b0e-4335-9a50-d19389cf1620 md""" On initialise ``V_{r1}, V_{r2} \in [0, 1]^{11}`` aléatoirement. On souhaite vérifier que ``V_{r1}`` converge vers ``V_{r2}`` (et aussi vers ``V``) par l'évaluation itérative de la politique ``P``. """ # ╔═╡ c005a3f8-765c-4a50-90ef-73a5a72eee01 begin Random.seed!(RANDOM_SEED) V_random1 = rand(length(REWARDS)) V_random2 = rand(length(REWARDS)) local diffs = Vector{Float64}() for _ in 1:MAX_ITERATIONS global V_random1 = REWARDS + Matrix(DISCOUNT_FACTOR * P) * V_random1 global V_random2 = REWARDS + Matrix(DISCOUNT_FACTOR * P) * V_random2 append!(diffs, norm(V_random1 - V_random2)) if isapprox(V_random1, V_random2) break end end plot( diffs, labels = "", xlabel = L"n", ylabel = L"|| V_{r1} - V_{r2} ||^2", yticks=[10.0^-x for x in 0:10], linewidth=2, yaxis=:log, title="Bellman's operator contraction", ) end # ╔═╡ 1b43e9e5-d7d2-4b5e-a2b2-3a8b8eda6d62 if isapprox(V_random1, V_random2, rtol=0.01) Markdown.MD(Markdown.Admonition("correct", "On vérifie que l'opérateur de Bellman est une contraction", [md"``V_{r1}`` converge vers ``V_{r2}``"])); else Markdown.MD(Markdown.Admonition("danger", "On ne vérifie pas que l'opérateur de Bellman est une contraction", [md"``V_{r1}`` ne converge pas vers ``V_{r2}``"])); end # ╔═╡ add0221b-e352-4559-a722-c45a64f573f9 md""" # Optimization """ # ╔═╡ 84e07dce-bf6d-4ac1-bfa4-65414fe1d787 md""" ## Question 1 > Write down the Bellman equation that characterizes the optimal policy. """ # ╔═╡ df13fa05-14de-409b-a0b1-5bba5eff432e md""" Bellman Optimality Equation for ``V_\star`` and ``\pi_\star``: $V_\star(s) = \max_{a \in A} \left( r(s,a) + \gamma \sum_{s' \in S} p(s' | s, a) V_\star(s') \right)$ $\pi_\star(s) \in \mathop{\mathrm{argmax}}_{a \in A} \left( r(s,a) + \gamma \sum_{s' \in S} p(s' | s, a) V_\star(s') \right)$ """ # ╔═╡ ac490e4a-ce20-4288-a04f-c224df5ade1a md""" ## Question 2 > Solve numerically the optimal value function by Value Iteration Algorithm (lecture 2, 2nd part, slides 15/35). Verify that the solution you obtain satisfies the Bellman equation. 
""" # ╔═╡ 33890f22-d3f6-4bcf-870d-756f7ff250a9 md""" ``P_g`` la politique du déplacement toujours à gauche: """ # ╔═╡ cf9fb8a8-6c93-4c43-9f01-5f198f0cf4aa begin P_g = [ [ 0.8, 0.1, 0.0, 0.0, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ], [ 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ], [ 0.0, 0.7, 0.1, 0.1, 0.0, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, ], [ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ], [ 0.1, 0.0, 0.0, 0.0, 0.8, 0.0, 0.0, 0.1, 0.0, 0.0, 0.0, ], [ 0.0, 0.0, 0.1, 0.0, 0.0, 0.7, 0.1, 0.0, 0.0, 0.1, 0.0, ], [ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ], [ 0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.8, 0.1, 0.0, 0.0, ], [ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, ], [ 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.7, 0.1, 0.1, ], [ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.7, 0.2, ], ] P_g = sparse(reduce(hcat, P_g)') end # ╔═╡ dc87b85f-c87c-4302-9124-194bd799f1fd md""" ``P_h`` la politique du déplacement toujours en haut: """ # ╔═╡ b2595dec-aa5b-462b-b0f8-3555c1231b2f begin P_h = [ [ 0.8, 0.1, 0.0, 0.0, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ], [ 0.1, 0.8, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ], [ 0.0, 0.1, 0.7, 0.1, 0.0, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, ], [ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ], [ 0.7, 0.0, 0.0, 0.0, 0.2, 0.0, 0.0, 0.1, 0.0, 0.0, 0.0, ], [ 0.0, 0.0, 0.7, 0.0, 0.0, 0.1, 0.1, 0.0, 0.0, 0.1, 0.0, ], [ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ], [ 0.0, 0.0, 0.0, 0.0, 0.7, 0.0, 0.0, 0.2, 0.1, 0.0, 0.0, ], [ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.8, 0.1, 0.0, ], [ 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.0, 0.0, 0.1, 0.1, 0.1, ], [ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.0, 0.0, 0.1, 0.2, ], ] P_h = sparse(reduce(hcat, P_h)') end # ╔═╡ 70edf811-adb0-4ae8-941a-b298d85a6e0e md""" ``P_b`` la politique du déplacement toujours en bas: """ # ╔═╡ 875673f1-08c9-4713-bbc2-85b0a7a0cb0a begin P_b = [ [ 0.2, 0.1, 0.0, 0.0, 0.7, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ], [ 0.1, 0.8, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ], [ 0.0, 0.1, 0.1, 0.1, 0.0, 0.7, 0.0, 0.0, 0.0, 0.0, 0.0, ], [ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ], [ 0.1, 0.0, 0.0, 0.0, 0.2, 0.0, 0.0, 0.7, 0.0, 0.0, 0.0, ], [ 0.0, 0.0, 0.1, 0.0, 0.0, 0.1, 0.1, 0.0, 0.0, 0.7, 0.0, ], [ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ], [ 0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.8, 0.1, 0.0, 0.0, ], [ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.8, 0.1, 0.0, ], [ 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.1, 0.7, 0.1, ], [ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.1, 0.8, ], ] P_b = sparse(reduce(hcat, P_b)') end # ╔═╡ 2deaac7c-ad14-43b0-9cd5-9f0ec12d324c md""" ``P_d`` la politique du déplacement toujours à droite: """ # ╔═╡ b5c93b6f-933c-41b4-8399-44cc0fa07fab begin P_d = [ [ 0.2, 0.7, 0.0, 0.0, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ], [ 0.1, 0.2, 0.7, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ], [ 0.0, 0.1, 0.1, 0.7, 0.0, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, ], [ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ], [ 0.1, 0.0, 0.0, 0.0, 0.8, 0.0, 0.0, 0.1, 0.0, 0.0, 0.0, ], [ 0.0, 0.0, 0.1, 0.0, 0.0, 0.1, 0.7, 0.0, 0.0, 0.1, 0.0, ], [ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ], [ 0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.2, 0.7, 0.0, 0.0, ], [ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.2, 0.7, 0.0, ], [ 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.1, 0.1, 0.7, ], [ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.1, 0.8, ], ] P_d = sparse(reduce(hcat, P_d)') end # ╔═╡ 8015bdbb-82dd-48da-905d-a25e5c864298 md""" Pour trouver la politique optimal, on peut 
procéder de la façon suivante: Initialiser ``V_\star`` (random). \ Tant qu'on a pas convergé (ou atteint `MAX_ITERATIONS`): - Calculer pour chaque direction (gauche, haut, bas, droite) le vecteur correspondant à la fonction valeur de la politique associée à la direction. - Sélectionner notre nouvel `V_optimal` comme le maximum par ligne de nos vecteurs issus des fonctions valeur (des quatre directions). - Vérifier la convergeance par comparaison avec l'itération précédente. Par application de cet algorithme on obtient alors ``V_\star``: """ # ╔═╡ 3d7d0b11-5b99-4b1f-ab06-3366678eece8 begin Random.seed!(RANDOM_SEED) V_optimal = rand(length(REWARDS)) pi = zeros(length(REWARDS)) for _ in 1:MAX_ITERATIONS local V_g = REWARDS + Matrix(DISCOUNT_FACTOR * P_g) * V_optimal local V_h = REWARDS + Matrix(DISCOUNT_FACTOR * P_h) * V_optimal local V_b = REWARDS + Matrix(DISCOUNT_FACTOR * P_b) * V_optimal local V_d = REWARDS + Matrix(DISCOUNT_FACTOR * P_d) * V_optimal local V_new = maximum.(eachrow([V_g V_h V_b V_d])) if isapprox(V_new, V_optimal) pi = argmax.(eachrow([V_g V_h V_b V_d])) break else V_optimal = V_new end end local V_display = copy(V_optimal) insert!(V_display, 6, 0) V_display = permutedims(reshape(V_display, 4, 3)) V_display = sparse(V_display) end # ╔═╡ 664bb753-ccce-4c7a-8b11-76261a3b80d2 md""" ## Question 3 > Explain how you can infer the optimal action in every state from the optimal value function ``V_\star(s)``. Represent in a 2D matrix the optimal policy. """ # ╔═╡ df01ea55-b289-4c13-8a6b-780ce068e44c md""" La politique optimale se trouve en sélectionnant la direction la plus favorable dans chaque état: """ # ╔═╡ d7ff1cb5-d2b4-4597-bcef-0f74f2e7e0db begin pi_symbols = [ARROW_SYMBOLS[i] for i in pi] insert!(pi_symbols, 6, WALL_SYMBOL) pi_symbols[4] = SUCCESS_SYMBOL pi_symbols[8] = DEATH_SYMBOL permutedims(reshape(pi_symbols, 4, 3)) end # ╔═╡ 40b7e793-d869-4b68-83a1-6bd7d20a3941 md""" ## Question 4 > Compare the performances obtained with the random policy and the optimal one, how can you conclude that the optimal policy performs better ? 
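
In addition to comparing the value functions, a complementary empirical check is to simulate trajectories under both transition matrices and compare the average discounted return actually collected. This is only a sketch: the helper name `simulate_return`, the start state (8, the bottom-left cell) and the simulation parameters are illustrative choices, not part of the assignment.

```julia
function simulate_return(P_pi, rewards; start = 8, γ = DISCOUNT_FACTOR,
                         horizon = 200, episodes = 10_000)
    total = 0.0
    for _ in 1:episodes
        s, discount = start, 1.0
        for _ in 1:horizon
            total += discount * rewards[s]       # reward of the current state
            c = cumsum(Vector(P_pi[s, :]))       # cumulative transition probabilities
            c[end] == 0 && break                 # absorbing state reached: stop
            s = findfirst(>(rand() * c[end]), c) # sample the next state
            discount *= γ
        end
    end
    return total / episodes
end

# Expected (up to Monte-Carlo noise): the return under P_star dominates the one under P,
# i.e. simulate_return(P_star, REWARDS) ≥ simulate_return(P, REWARDS)
```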
""" # ╔═╡ dce3978b-1334-426e-80cc-9cfe63989909 md""" À partir ``\pi^\star`` on peut aussi trouver ``P^\star`` la matrice de notre politique optimale: """ # ╔═╡ 7aae25dc-38cf-40d5-a7da-44d13d397194 begin P_star = sparse(zeros(11, 11)) for i in 1:11 if pi[i] == 1 P_star[i, :] = P_g[i, :] elseif pi[i] == 2 P_star[i, :] = P_h[i, :] elseif pi[i] == 3 P_star[i, :] = P_b[i, :] else P_star[i, :] = P_d[i, :] end end P_star end # ╔═╡ b075f5fc-85ac-45a0-8e27-605d3dac0e97 begin Random.seed!(RANDOM_SEED) V_Prandom = rand(length(REWARDS)) V_Poptimal = rand(length(REWARDS)) ratio = Vector{Float64}() convergence_random = Vector{Float64}() convergence_optimal = Vector{Float64}() for _ in 1:MAX_ITERATIONS V_Prandom = REWARDS + Matrix(DISCOUNT_FACTOR * P) * V_Prandom V_Poptimal = REWARDS + Matrix(DISCOUNT_FACTOR * P_star) * V_Poptimal append!(convergence_optimal, norm(V_Poptimal-V_optimal)) append!(convergence_random, norm(V_Prandom-V)) append!(ratio, norm(V_Poptimal./V_Prandom)) end end # ╔═╡ 1fe62967-a9ea-4f6a-817e-666a900c8f92 plot( [convergence_optimal, convergence_random], labels = ["Optimal" "Random"], xlabel = L"n", ylabel = L"|| V^\star - \ \ V^r ||^2", yticks=[10.0^-x for x in 0:20], linewidth=2, yaxis=:log, title="Optimal vs Random: Convergence", ) # ╔═╡ f31ce9b6-8399-4263-bad7-20c859116fa9 begin plot( ratio, labels = "", xlabel = L"n", ylabel = L"|| V^\star / \ \ V^r ||^2", linewidth=2, title="Optimal vs Random: Ratio", ylims=[0, Inf] ) end # ╔═╡ 05373383-0c51-49f2-8a62-b06a6225d659 md""" ## Question 5 > **Policy Iteration I**: We are now going to calculate the optimal policy using Policy Iteration (lecture 2, 2nd part, slides 23/35 and 24/35). You can start with the random policy for which you calculated its performance in the **Performance Prediction** section. Carry out a one-step improvement (or greedy step) on the random policy. Represent in a 2D matrix the policy you obtain. How can we verify that it is a better policy than the random one? """ # ╔═╡ 81572e40-4cde-4a13-84aa-5c5d6a9dbde3 md""" 0. Initialization: choose a policy ``\pi_0`` On reprend ici notre politique aléatoire ``P^{\pi_0}`` de la [question 1 partie 1](#beb410a8-03e2-4f18-8ccd-941cc926ee12): """ # ╔═╡ 4b264154-944d-498b-a998-a4b07f77918e begin P_pi_0 = P P_pi_0 end # ╔═╡ a68a3d33-f4df-456e-af13-9b39e14dbc13 md""" 2. Policy Evaluation: Compute iteratively ``V_{\pi_k} = (I − \gamma P^{\pi_k} )^{-1} R^{\pi_k}`` (on calcule uniquement ``V_{\pi_0}`` dans cette question) """ # ╔═╡ c3a6ab2c-7a3e-458f-a108-e6e81aa3def1 begin V_pi_0 = Matrix(I(length(REWARDS)) - DISCOUNT_FACTOR * P_pi_0) \ REWARDS local V_display = copy(V_pi_0) insert!(V_display, 6, 0) V_display = permutedims(reshape(V_display, 4, 3)) V_display = sparse(V_display) end # ╔═╡ ea457cd9-0db5-433f-9d57-1e875a160990 md""" 3. 
Policy improvement: Compute ``\pi_{k+1} = \text{greedy}(V_{\pi_k})`` (On calcule donc ici uniquement ``\pi_1`` ) """ # ╔═╡ 3d62d11d-383c-4060-b697-be0c0155ce95 begin local V_g = REWARDS + Matrix(DISCOUNT_FACTOR * P_g) * V_pi_0 local V_h = REWARDS + Matrix(DISCOUNT_FACTOR * P_h) * V_pi_0 local V_b = REWARDS + Matrix(DISCOUNT_FACTOR * P_b) * V_pi_0 local V_d = REWARDS + Matrix(DISCOUNT_FACTOR * P_d) * V_pi_0 local pi_1 = argmax.(eachrow([V_g V_h V_b V_d])) P_pi_1 = sparse(zeros(11, 11)) for i in 1:11 if pi_1[i] == 1 P_pi_1[i, :] = P_g[i, :] elseif pi_1[i] == 2 P_pi_1[i, :] = P_h[i, :] elseif pi_1[i] == 3 P_pi_1[i, :] = P_b[i, :] else P_pi_1[i, :] = P_d[i, :] end end P_pi_1 end # ╔═╡ 245f3394-d5e3-4d2c-96a6-ce5ea0bc7d84 md""" Stop if ``V_{\pi_{k+1}} = V_{\pi_k}``, else repeat (Ici on s'arrête comme le dit l'énoncé pour k=1) """ # ╔═╡ 4f597447-f321-4a8f-adf0-3fd655ab203c begin diff_star_pi_0 = sum(abs.(P_star - P_pi_0)) diff_star_pi_1 = sum(abs.(P_star - P_pi_1)) md""" On peut vérifier que ``\pi_1`` est meilleur que ``\pi_0`` en calculant: ``||\pi_\star - \pi_1||_\text{F} = `` $(diff_star_pi_1) ``||\pi_\star - \pi_0||_\text{F} = `` $(diff_star_pi_0) """ end # ╔═╡ d599c370-6cb5-4bc3-a333-d41e207c39dc if diff_star_pi_1 <= diff_star_pi_0 Markdown.MD(Markdown.Admonition("correct", "On a une meilleur politique après une itération", [md"``||\pi_\star - \pi_1||_\text{F} \leq ||\pi_\star - \pi_0||_\text{F}``"])); else Markdown.MD(Markdown.Admonition("danger", "On n'a pas une meilleur politique après une itération", [md"``||\pi_\star - \pi_1||_\text{F} \nleq ||\pi_\star - \pi_0||_\text{F}``"])); end # ╔═╡ 4e8e49b2-60ea-4dc6-906b-d459c7983b34 md""" ## Question 6 > **Policy Iteration II**: Continue iterating the Prediction and the greedy steps until convergence to the optimal policy. """ # ╔═╡ 362a3786-f85d-44b9-b369-ecbf4e5194e9 begin P_pi_k = P_pi_0 local diffs = Vector{Float64}() for k in 1:MAX_ITERATIONS V_pi_k = Matrix(I(length(REWARDS)) - DISCOUNT_FACTOR * P_pi_k) \ REWARDS local V_g = REWARDS + Matrix(DISCOUNT_FACTOR * P_g) * V_pi_k local V_h = REWARDS + Matrix(DISCOUNT_FACTOR * P_h) * V_pi_k local V_b = REWARDS + Matrix(DISCOUNT_FACTOR * P_b) * V_pi_k local V_d = REWARDS + Matrix(DISCOUNT_FACTOR * P_d) * V_pi_k local pi_k = argmax.(eachrow([V_g V_h V_b V_d])) global P_pi_k = sparse(zeros(11, 11)) for i in 1:11 if pi_k[i] == 1 P_pi_k[i, :] = P_g[i, :] elseif pi_k[i] == 2 P_pi_k[i, :] = P_h[i, :] elseif pi_k[i] == 3 P_pi_k[i, :] = P_b[i, :] else P_pi_k[i, :] = P_d[i, :] end end append!(diffs, sum(abs.(P_star - P_pi_k))) if isapprox(P_star, P_pi_k) break end end local p = plot( diffs, labels = "", xlabel = L"k", ylabel = L"||\pi_\star - \pi_k||_F", linewidth=2, title="Policy Iteration convergence", ) xticks!(round(Int,xlims(p)[1]):round(Int,xlims(p)[2])) end # ╔═╡ a1eaf48e-f92f-4554-942e-f6303ebaa084 md""" ## Question 7 > Investigate the structure of the optimal policy for different values of ``\gamma``, and explain the results. You might use Value Iteration or Policy Iteration. 
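
The sweep below counts how many Policy Iteration steps are needed for each ``\gamma``; note that it checks convergence against the ``P^\star`` obtained for the current `DISCOUNT_FACTOR`, so for values of ``\gamma`` whose optimal policy differs from that one it reports `MAX_ITERATIONS` even though Policy Iteration itself has stabilized. To inspect the *structure* of the optimal policy itself, a small sketch such as the following (the helper name `optimal_policy_for` is illustrative) can recompute and display the greedy policy for a few values of ``\gamma``:

```julia
function optimal_policy_for(γ; iters = MAX_ITERATIONS)
    V = zeros(length(REWARDS))
    policy = ones(Int, length(REWARDS))
    for _ in 1:iters
        # one Value Iteration sweep over the four deterministic actions
        Q = hcat([REWARDS + γ * Matrix(Pa) * V for Pa in (P_g, P_h, P_b, P_d)]...)
        V_new = maximum.(eachrow(Q))
        policy = argmax.(eachrow(Q))
        isapprox(V_new, V) && break
        V = V_new
    end
    symbols = [ARROW_SYMBOLS[a] for a in policy]
    insert!(symbols, 6, WALL_SYMBOL)
    symbols[4], symbols[8] = SUCCESS_SYMBOL, DEATH_SYMBOL
    return permutedims(reshape(symbols, 4, 3))
end

# e.g. compare a strongly discounted agent with a far-sighted one:
# optimal_policy_for(0.5), optimal_policy_for(0.99)
```

This makes it easy to see whether, for a given choice of rewards, a more far-sighted agent prefers a longer but safer route around the penalty state.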
""" # ╔═╡ 8d5b2cc2-2e21-47df-b821-189de5d357a3 begin local gammas = 0.9:0.001:0.99 local iterations = zeros(length(gammas)) for (i, gamma) in enumerate(gammas) global P_pi_k2 = P_pi_0 k = 0 while true V_pi_k = Matrix(I(length(REWARDS)) - gamma * P_pi_k2) \ REWARDS local V_g = REWARDS + Matrix(gamma * P_g) * V_pi_k local V_h = REWARDS + Matrix(gamma * P_h) * V_pi_k local V_b = REWARDS + Matrix(gamma * P_b) * V_pi_k local V_d = REWARDS + Matrix(gamma * P_d) * V_pi_k local pi_k = argmax.(eachrow([V_g V_h V_b V_d])) P_pi_k2 = sparse(zeros(11, 11)) for i in 1:11 if pi_k[i] == 1 P_pi_k2[i, :] = P_g[i, :] elseif pi_k[i] == 2 P_pi_k2[i, :] = P_h[i, :] elseif pi_k[i] == 3 P_pi_k2[i, :] = P_b[i, :] else P_pi_k2[i, :] = P_d[i, :] end end k += 1 if isapprox(P_star, P_pi_k2) || k >= MAX_ITERATIONS break end end iterations[i] = k end local p = plot( gammas, iterations, labels = "", xlabel = L"\gamma", ylabel = L"k", linetype=:steppre, linewidth=2, title=md"Policy Iteration convergence according to ``\gamma``", ) yticks!(round.(Int, yticks(p)[1][1])) end # ╔═╡ 0c6fd7ed-5180-41bd-9958-29cc9f3ce73b md""" On observe qu'il y a convergence de la politique généralement en dessous de 5 itérations. Cependant pour certaines combinaisons d'hyperparamètres on remarque qu'il n'y a pas convergence. """ # ╔═╡ Cell order: # ╟─02b1e10c-653e-4660-90b5-2eae7f19f1f7 # ╟─26fdd17e-f03a-4835-93be-85303fe526d8 # ╟─56ac3473-24f4-42d7-84e1-cfce6a70d8d5 # ╟─ccf4d63e-7ace-11ed-2123-d9dbb62bd308 # ╟─9f2879c1-c22b-4067-ad20-4e4c56cc8d00 # ╟─0a30a68a-068e-41fb-92c4-000869ba7dff # ╟─07b57746-fba0-49aa-ba17-6dcb0bbe44e5 # ╟─92d6874b-651c-4551-840e-ad5d1e934aeb # ╟─fe44d7f2-155e-42f2-83c3-dd18aadb3810 # ╟─28b769a6-dd3c-43ab-bae0-646d8ebc35d6 # ╟─3881603c-619b-4976-ac4c-2c7e7f3a6ec7 # ╟─fb797a9b-6a0a-4a77-a9b6-6804f98639bb # ╟─1e3abda8-6645-48ba-874d-28e1011fc3e3 # ╟─beb410a8-03e2-4f18-8ccd-941cc926ee12 # ╟─133f291f-6f21-4441-86f7-ba190a7d6b1f # ╟─e14f9977-d2fd-4d05-84d6-614008dc0c4a # ╟─486c93ab-9cb9-4df4-b702-bbe12a961647 # ╟─ab2d705d-fc00-43b2-bb6d-2a3d4ba9dab1 # ╟─b7ae89c9-3c1b-4f5c-af5b-164d95ccca41 # ╟─03c17428-5ab9-42e7-bf79-92eb846f11cb # ╟─c65d0dbc-ecd7-4320-9b3a-a1b9c0545f9a # ╟─ad547684-bcbe-44f4-9fc1-f327d2db4584 # ╟─d3703ab8-912c-417d-acd9-29590ec1134b # ╟─1319b304-5126-4825-8076-e113e4dd3635 # ╟─3ea3f177-c576-4b9e-a54b-c427e29a8491 # ╟─e94fe8a6-274b-4121-b1fc-063d3710c2f7 # ╟─80090d5f-d56c-4844-a04f-444ed49e5f34 # ╟─98362798-aae4-4540-9e98-cc7371802552 # ╟─30874daf-7b0e-4335-9a50-d19389cf1620 # ╟─c005a3f8-765c-4a50-90ef-73a5a72eee01 # ╟─1b43e9e5-d7d2-4b5e-a2b2-3a8b8eda6d62 # ╟─add0221b-e352-4559-a722-c45a64f573f9 # ╟─84e07dce-bf6d-4ac1-bfa4-65414fe1d787 # ╟─df13fa05-14de-409b-a0b1-5bba5eff432e # ╟─ac490e4a-ce20-4288-a04f-c224df5ade1a # ╟─33890f22-d3f6-4bcf-870d-756f7ff250a9 # ╟─cf9fb8a8-6c93-4c43-9f01-5f198f0cf4aa # ╟─dc87b85f-c87c-4302-9124-194bd799f1fd # ╟─b2595dec-aa5b-462b-b0f8-3555c1231b2f # ╟─70edf811-adb0-4ae8-941a-b298d85a6e0e # ╟─875673f1-08c9-4713-bbc2-85b0a7a0cb0a # ╟─2deaac7c-ad14-43b0-9cd5-9f0ec12d324c # ╟─b5c93b6f-933c-41b4-8399-44cc0fa07fab # ╟─8015bdbb-82dd-48da-905d-a25e5c864298 # ╟─3d7d0b11-5b99-4b1f-ab06-3366678eece8 # ╟─664bb753-ccce-4c7a-8b11-76261a3b80d2 # ╟─df01ea55-b289-4c13-8a6b-780ce068e44c # ╟─d7ff1cb5-d2b4-4597-bcef-0f74f2e7e0db # ╟─40b7e793-d869-4b68-83a1-6bd7d20a3941 # ╟─dce3978b-1334-426e-80cc-9cfe63989909 # ╟─7aae25dc-38cf-40d5-a7da-44d13d397194 # ╟─b075f5fc-85ac-45a0-8e27-605d3dac0e97 # ╟─1fe62967-a9ea-4f6a-817e-666a900c8f92 # ╟─f31ce9b6-8399-4263-bad7-20c859116fa9 # 
╟─05373383-0c51-49f2-8a62-b06a6225d659 # ╟─81572e40-4cde-4a13-84aa-5c5d6a9dbde3 # ╟─4b264154-944d-498b-a998-a4b07f77918e # ╟─a68a3d33-f4df-456e-af13-9b39e14dbc13 # ╟─c3a6ab2c-7a3e-458f-a108-e6e81aa3def1 # ╟─ea457cd9-0db5-433f-9d57-1e875a160990 # ╟─3d62d11d-383c-4060-b697-be0c0155ce95 # ╟─245f3394-d5e3-4d2c-96a6-ce5ea0bc7d84 # ╟─4f597447-f321-4a8f-adf0-3fd655ab203c # ╟─d599c370-6cb5-4bc3-a333-d41e207c39dc # ╟─4e8e49b2-60ea-4dc6-906b-d459c7983b34 # ╟─362a3786-f85d-44b9-b369-ecbf4e5194e9 # ╟─a1eaf48e-f92f-4554-942e-f6303ebaa084 # ╟─8d5b2cc2-2e21-47df-b821-189de5d357a3 # ╟─0c6fd7ed-5180-41bd-9958-29cc9f3ce73b