### A Pluto.jl notebook ###
# v0.19.36
#> [frontmatter]
#> title = "TP1 - Reinforcement learning"
#> date = "2022-12-14"
#> tags = ["RL"]
using Markdown
using InteractiveUtils
# This Pluto notebook uses @bind for interactivity. When running this notebook outside of Pluto, the following 'mock version' of @bind gives bound variables a default value (instead of an error).
macro bind(def, element)
	quote
		local iv = try Base.loaded_modules[Base.PkgId(Base.UUID("6e696c72-6542-2067-7265-42206c756150"), "AbstractPlutoDingetjes")].Bonds.initial_value catch; b -> missing; end
		local el = $(esc(element))
		global $(esc(def)) = Core.applicable(Base.get, el) ? Base.get(el) : iv(el)
		el
	end
end
# ╔═╡ 02b1e10c-653e-4660-90b5-2eae7f19f1f7
# ╠═╡ show_logs = false
# https://github.com/fonsp/Pluto.jl/wiki/%F0%9F%8E%81-Package-management#advanced-set-up-an-environment-with-pkgactivate
begin
using Pkg
Pkg.activate()
end
# ╔═╡ 26fdd17e-f03a-4835-93be-85303fe526d8
begin
	using Plots # for plotting figures
	using PlutoUI # for Pluto UI objects
	using LinearAlgebra # for identity matrices
	using SparseArrays # for sparse matrices
	using LaTeXStrings # for LaTeX strings (for the plots)
	import Random

	TableOfContents(depth=4)
# ╔═╡ 56ac3473-24f4-42d7-84e1-cfce6a70d8d5
html """
< style >
pluto - output , img {
display : block ;
margin : auto ;
}
. sliders {
position : fixed ;
left : 2 rem ;
top : 40 % ;
z - index : 1000 ;
}
< / style >
"""
# ╔═╡ ccf4d63e-7ace-11ed-2123-d9dbb62bd308
html """
< center >
< strong style = " font-size: 2rem; " >
TP1 - Reinforcement learning < br / >
Laurent Fainsin < br / >
2021 - 2022
< / strong >
< / center >
"""
# ╔═╡ 9f2879c1-c22b-4067-ad20-4e4c56cc8d00
begin
	REWARD_MOVE_slider = @bind REWARD_MOVE Slider(-0.1:0.01:0, default=-0.04, show_value=true)
	REWARD_GOAL_slider = @bind REWARD_GOAL Slider(0:1:10, default=1, show_value=true)
	REWARD_DEATH_slider = @bind REWARD_DEATH Slider(-10:1:0, default=-2, show_value=true)
	DISCOUNT_FACTOR_slider = @bind DISCOUNT_FACTOR Slider(0.9:0.01:0.99, default=0.9, show_value=true)

	div = html"""<div class="sliders">"""
	div_end = html"""</div>"""

	md"""
	$(div)

	Hyper-parameters:

	REWARD\_MOVE: $(REWARD_MOVE_slider)

	REWARD\_GOAL: $(REWARD_GOAL_slider)

	REWARD\_DEATH: $(REWARD_DEATH_slider)

	DISCOUNT\_FACTOR: $(DISCOUNT_FACTOR_slider)

	$(div_end)
	"""
end
# ╔═╡ 0a30a68a-068e-41fb-92c4-000869ba7dff
RANDOM_SEED = 420
# ╔═╡ 07b57746-fba0-49aa-ba17-6dcb0bbe44e5
MAX_ITERATIONS = 350
# ╔═╡ 92d6874b-651c-4551-840e-ad5d1e934aeb
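# Presumed interpretation: each row corresponds to one intended action (left, up,
# down, right, in the same order as ARROW_SYMBOLS) and gives the probabilities of
# the action actually executed: the intended move with probability 0.7, each of
# the other three directions with probability 0.1.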
MOVEMENTS = [
0.7 0.1 0.1 0.1
0.1 0.7 0.1 0.1
0.1 0.1 0.7 0.1
0.1 0.1 0.1 0.7
]
# ╔═╡ fe44d7f2-155e-42f2-83c3-dd18aadb3810
md """
On définit notre environnement comme une grille 3 x4 :
"""
# ╔═╡ 28b769a6-dd3c-43ab-bae0-646d8ebc35d6
begin
	ARROW_SYMBOLS = ["⬅️", "⬆️", "⬇️", "➡️"]
	DEATH_SYMBOL = "☠️"
	SUCCESS_SYMBOL = "🏆"
	WALL_SYMBOL = "🧱"
	EMPTY_SYMBOL = "🟫"

	[
		EMPTY_SYMBOL EMPTY_SYMBOL EMPTY_SYMBOL SUCCESS_SYMBOL
		EMPTY_SYMBOL WALL_SYMBOL EMPTY_SYMBOL DEATH_SYMBOL
		EMPTY_SYMBOL EMPTY_SYMBOL EMPTY_SYMBOL EMPTY_SYMBOL
	]
end
# ╔═╡ 3881603c-619b-4976-ac4c-2c7e7f3a6ec7
md """
On peut définir nos rewards tels que :
"""
# ╔═╡ fb797a9b-6a0a-4a77-a9b6-6804f98639bb
begin
	REWARDS = [
		REWARD_MOVE, REWARD_MOVE, REWARD_MOVE, REWARD_GOAL,
		REWARD_MOVE, REWARD_MOVE, REWARD_DEATH,
		REWARD_MOVE, REWARD_MOVE, REWARD_MOVE, REWARD_MOVE,
	]

	local REWARDS_display = copy(REWARDS)
	insert!(REWARDS_display, 6, 0)
	REWARDS_display = permutedims(reshape(REWARDS_display, 4, 3))
	REWARDS_display = sparse(REWARDS_display)
end
# ╔═╡ 1e3abda8-6645-48ba-874d-28e1011fc3e3
md """
# Performance Prediction
"""
# ╔═╡ beb410a8-03e2-4f18-8ccd-941cc926ee12
md """
## Question 1
> Assume the random policy , that is , the policy that takes every possible action with probability 1 / 4. Compute its value function by solving \
> $ V = ( I − \ gamma P ) ^ { - 1 } R $ . \
> Since there are 11 possible states in the problem , the vectors ` ` R ` ` and ` ` V ` ` have have length 11 , and the matrix ` ` P ` ` has dimension 11 x11 . There are two absorbing states , i . e . , they are visited once , and their respective reward ( + 1 or - 1 ) is only accrued once . To model this , you can simply put all 0 ’ s in all the elements of the respective two lines .
"""
# ╔═╡ 133f291f-6f21-4441-86f7-ba190a7d6b1f
md """
On définit une politique aléatoire ( à la main ) :
"""
# ╔═╡ e14f9977-d2fd-4d05-84d6-614008dc0c4a
[
	ARROW_SYMBOLS[2] ARROW_SYMBOLS[1] ARROW_SYMBOLS[1] SUCCESS_SYMBOL
	ARROW_SYMBOLS[1] WALL_SYMBOL ARROW_SYMBOLS[1] DEATH_SYMBOL
	ARROW_SYMBOLS[1] ARROW_SYMBOLS[3] ARROW_SYMBOLS[2] ARROW_SYMBOLS[1]
]
# ╔═╡ 486c93ab-9cb9-4df4-b702-bbe12a961647
md """
Via nos probabilités de mouvements on peut alors constituer ` ` P ` ` :
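
Note that rows 4 and 7 of ``P`` (the goal state and the death state) contain only zeros: this is how the two absorbing states are modeled, as asked in the question statement.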
"""
# ╔═╡ ab2d705d-fc00-43b2-bb6d-2a3d4ba9dab1
begin
P = [
[
0.8 , 0.1 , 0.0 , 0.0 ,
0.1 , 0.0 , 0.0 ,
0.0 , 0.0 , 0.0 , 0.0 ,
] ,
[
0.7 , 0.2 , 0.1 , 0.0 ,
0.0 , 0.0 , 0.0 ,
0.0 , 0.0 , 0.0 , 0.0 ,
] ,
[
0.0 , 0.7 , 0.1 , 0.1 ,
0.0 , 0.1 , 0.0 ,
0.0 , 0.0 , 0.0 , 0.0 ,
] ,
[
0.0 , 0.0 , 0.0 , 0.0 ,
0.0 , 0.0 , 0.0 ,
0.0 , 0.0 , 0.0 , 0.0 ,
] ,
[
0.1 , 0.0 , 0.0 , 0.0 ,
0.8 , 0.0 , 0.0 ,
0.1 , 0.0 , 0.0 , 0.0 ,
] ,
[
0.0 , 0.0 , 0.1 , 0.0 ,
0.0 , 0.7 , 0.1 ,
0.0 , 0.0 , 0.1 , 0.0 ,
] ,
[
0.0 , 0.0 , 0.0 , 0.0 ,
0.0 , 0.0 , 0.0 ,
0.0 , 0.0 , 0.0 , 0.0 ,
] ,
[
0.0 , 0.0 , 0.0 , 0.0 ,
0.1 , 0.0 , 0.0 ,
0.8 , 0.1 , 0.0 , 0.0 ,
] ,
[
0.0 , 0.0 , 0.0 , 0.0 ,
0.0 , 0.0 , 0.0 ,
0.1 , 0.8 , 0.1 , 0.0 ,
] ,
[
0.0 , 0.0 , 0.0 , 0.0 ,
0.0 , 0.7 , 0.0 ,
0.0 , 0.1 , 0.1 , 0.1 ,
] ,
[
0.0 , 0.0 , 0.0 , 0.0 ,
0.0 , 0.0 , 0.1 ,
0.0 , 0.0 , 0.7 , 0.2 ,
] ,
]
P = sparse(reduce(hcat, P)')
end
# ╔═╡ b7ae89c9-3c1b-4f5c-af5b-164d95ccca41
md """
On peut alors calculer ` ` V ` ` :
"""
# ╔═╡ 03c17428-5ab9-42e7-bf79-92eb846f11cb
begin
	V = Matrix(I(length(REWARDS)) - DISCOUNT_FACTOR * P) \ REWARDS

	local V_display = copy(V)
	insert!(V_display, 6, 0)
	V_display = permutedims(reshape(V_display, 4, 3))
	V_display = sparse(V_display)
end
# ╔═╡ c65d0dbc-ecd7-4320-9b3a-a1b9c0545f9a
md """
### Bonus
On contrôle que ` ` V ` ` vérifie l ' équation de Bellman en calculant une itération de l ' équation de Bellman :
$ V_ { \ text { next } } = R + \ gamma P V $
et en observant que ` ` V ` ` est un point fixe :
$ V_ { \ text { next } } = V $
On calcule alors ` ` V_ \ text { next } ` ` :
"""
# ╔═╡ ad547684-bcbe-44f4-9fc1-f327d2db4584
begin
	V_next = REWARDS + Matrix(DISCOUNT_FACTOR * P) * V

	local V_display = copy(V_next)
	insert!(V_display, 6, 0)
	V_display = permutedims(reshape(V_display, 4, 3))
	V_display = sparse(V_display)
end
# ╔═╡ d3703ab8-912c-417d-acd9-29590ec1134b
if isapprox(V_next, V)
	Markdown.MD(Markdown.Admonition("correct", "V is a fixed point", [md"The Bellman equation is satisfied"]));
else
	Markdown.MD(Markdown.Admonition("danger", "V is not a fixed point", [md"The Bellman equation is not satisfied"]));
end
# ╔═╡ 1319b304-5126-4825-8076-e113e4dd3635
md """
## Question 2
> Evaluate now the policy using Iterative Policy Evaluation ( lecture 2 , 2 nd part , slides 11 / 35 ) , and verify that the algorithm converges to the result obtained in 1. To stop iterating , you can take as a criterion that the difference between two iterations must be smaller than some small ` ` \ delta ` ` . Due to the contraction principle , the initial vector can be arbitrary .
"""
# ╔═╡ 3ea3f177-c576-4b9e-a54b-c427e29a8491
md """
On initialise ` ` V_ \ text { random } \ in [ 0 , 1 ] ^ { 11 } ` ` aléatoirement .
On souhaite vérifier que ` ` V_ \ text { random } ` ` converge vers ` ` V ` ` par l ' évaluation itérative de la politique ` ` P ` ` .
"""
# ╔═╡ e94fe8a6-274b-4121-b1fc-063d3710c2f7
begin
	Random.seed!(RANDOM_SEED)

	V_random = rand(length(REWARDS))

	local diffs = Vector{Float64}()

	for _ in 1:MAX_ITERATIONS
		local V_old = V_random
		global V_random = REWARDS + Matrix(DISCOUNT_FACTOR * P) * V_random

		append!(diffs, norm(V_random - V_old))

		if isapprox(V_random, V_old)
			break
		end
	end

	plot(
		diffs,
		labels="",
		xlabel=L"n",
		ylabel=L"||V_{n+1} - V_n||^2",
		yticks=[10.0^-x for x in 0:10],
		linewidth=2,
		yaxis=:log,
		title="Iterative Policy Evaluation convergence",
	)
end
# ╔═╡ 80090d5f-d56c-4844-a04f-444ed49e5f34
if isapprox(V_random, V, rtol=1e-5)
	Markdown.MD(Markdown.Admonition("correct", "Iterative policy evaluation is verified", [md"``V_\text{random}`` converges to ``V``"]));
else
	Markdown.MD(Markdown.Admonition("danger", "Iterative policy evaluation is not verified", [md"``V_\text{random}`` does not converge to ``V``"]));
end
# ╔═╡ 98362798-aae4-4540-9e98-cc7371802552
md """
## Question 3
> To verify that the Bellman operator is a contraction , take two initial vectors , and calculate the max of their differences . Then , apply the iterative policy evaluation to these 2 vectors as done in the previous item , and plot the maximum of their differences as you keep iterating . Observe what happens with the difference as you iterate , and explain it .
"""
# ╔═╡ 30874daf-7b0e-4335-9a50-d19389cf1620
md """
On initialise ` ` V_ { r1 } , V_ { r2 } \ in [ 0 , 1 ] ^ { 11 } ` ` aléatoirement .
On souhaite vérifier que ` ` V_ { r1 } ` ` converge vers ` ` V_ { r2 } ` ` ( et aussi vers ` ` V ` ` ) par l ' évaluation itérative de la politique ` ` P ` ` .
"""
# ╔═╡ c005a3f8-765c-4a50-90ef-73a5a72eee01
begin
	Random.seed!(RANDOM_SEED)

	V_random1 = rand(length(REWARDS))
	V_random2 = rand(length(REWARDS))

	local diffs = Vector{Float64}()

	for _ in 1:MAX_ITERATIONS
		global V_random1 = REWARDS + Matrix(DISCOUNT_FACTOR * P) * V_random1
		global V_random2 = REWARDS + Matrix(DISCOUNT_FACTOR * P) * V_random2

		append!(diffs, norm(V_random1 - V_random2))

		if isapprox(V_random1, V_random2)
			break
		end
	end

	plot(
		diffs,
		labels="",
		xlabel=L"n",
		ylabel=L"||V_{r1} - V_{r2}||^2",
		yticks=[10.0^-x for x in 0:10],
		linewidth=2,
		yaxis=:log,
		title="Bellman's operator contraction",
	)
end
# ╔═╡ 1b43e9e5-d7d2-4b5e-a2b2-3a8b8eda6d62
if isapprox(V_random1, V_random2, rtol=0.01)
	Markdown.MD(Markdown.Admonition("correct", "The Bellman operator is verified to be a contraction", [md"``V_{r1}`` converges to ``V_{r2}``"]));
else
	Markdown.MD(Markdown.Admonition("danger", "The Bellman operator is not verified to be a contraction", [md"``V_{r1}`` does not converge to ``V_{r2}``"]));
end
# ╔═╡ add0221b-e352-4559-a722-c45a64f573f9
md """
# Optimization
"""
# ╔═╡ 84e07dce-bf6d-4ac1-bfa4-65414fe1d787
md """
## Question 1
> Write down the Bellman equation that characterizes the optimal policy .
"""
# ╔═╡ df13fa05-14de-409b-a0b1-5bba5eff432e
md """
Bellman Optimality Equation for ` ` V_ \ star ` ` and ` ` \ pi_ \ star ` ` :
$ V_ \ star ( s ) = \ max_ { a \ in A } \ left ( r ( s , a ) + \ gamma \ sum_ { s ' \ in S } p ( s ' | s , a ) V_ \ star ( s ' ) \ right ) $
$ \ pi_ \ star ( s ) \ in \ mathop { \ mathrm { argmax } } _ { a \ in A } \ left ( r ( s , a ) + \ gamma \ sum_ { s ' \ in S } p ( s ' | s , a ) V_ \ star ( s ' ) \ right ) $
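
In vector form, with one transition matrix ``P^a`` per action (the matrices ``P_g``, ``P_h``, ``P_b``, ``P_d`` constructed below), this reads ``V_\star = \max_{a} \left( R + \gamma P^a V_\star \right)`` with the maximum taken componentwise; this is the update used by Value Iteration in Question 2.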
"""
# ╔═╡ ac490e4a-ce20-4288-a04f-c224df5ade1a
md """
## Question 2
> Solve numerically the optimal value function by Value Iteration Algorithm ( lecture 2 , 2 nd part , slides 15 / 35 ) . Verify that the solution you obtain satisfies the Bellman equation .
"""
# ╔═╡ 33890f22-d3f6-4bcf-870d-756f7ff250a9
md """
` ` P_g ` ` la politique du déplacement toujours à gauche :
"""
# ╔═╡ cf9fb8a8-6c93-4c43-9f01-5f198f0cf4aa
begin
P_g = [
[
0.8 , 0.1 , 0.0 , 0.0 ,
0.1 , 0.0 , 0.0 ,
0.0 , 0.0 , 0.0 , 0.0 ,
] ,
[
0.7 , 0.2 , 0.1 , 0.0 ,
0.0 , 0.0 , 0.0 ,
0.0 , 0.0 , 0.0 , 0.0 ,
] ,
[
0.0 , 0.7 , 0.1 , 0.1 ,
0.0 , 0.1 , 0.0 ,
0.0 , 0.0 , 0.0 , 0.0 ,
] ,
[
0.0 , 0.0 , 0.0 , 0.0 ,
0.0 , 0.0 , 0.0 ,
0.0 , 0.0 , 0.0 , 0.0 ,
] ,
[
0.1 , 0.0 , 0.0 , 0.0 ,
0.8 , 0.0 , 0.0 ,
0.1 , 0.0 , 0.0 , 0.0 ,
] ,
[
0.0 , 0.0 , 0.1 , 0.0 ,
0.0 , 0.7 , 0.1 ,
0.0 , 0.0 , 0.1 , 0.0 ,
] ,
[
0.0 , 0.0 , 0.0 , 0.0 ,
0.0 , 0.0 , 0.0 ,
0.0 , 0.0 , 0.0 , 0.0 ,
] ,
[
0.0 , 0.0 , 0.0 , 0.0 ,
0.1 , 0.0 , 0.0 ,
0.8 , 0.1 , 0.0 , 0.0 ,
] ,
[
0.0 , 0.0 , 0.0 , 0.0 ,
0.0 , 0.0 , 0.0 ,
0.7 , 0.2 , 0.1 , 0.0 ,
] ,
[
0.0 , 0.0 , 0.0 , 0.0 ,
0.0 , 0.1 , 0.0 ,
0.0 , 0.7 , 0.1 , 0.1 ,
] ,
[
0.0 , 0.0 , 0.0 , 0.0 ,
0.0 , 0.0 , 0.1 ,
0.0 , 0.0 , 0.7 , 0.2 ,
] ,
]
P_g = sparse(reduce(hcat, P_g)')
end
# ╔═╡ dc87b85f-c87c-4302-9124-194bd799f1fd
md """
` ` P_h ` ` la politique du déplacement toujours en haut :
"""
# ╔═╡ b2595dec-aa5b-462b-b0f8-3555c1231b2f
begin
P_h = [
[
0.8 , 0.1 , 0.0 , 0.0 ,
0.1 , 0.0 , 0.0 ,
0.0 , 0.0 , 0.0 , 0.0 ,
] ,
[
0.1 , 0.8 , 0.1 , 0.0 ,
0.0 , 0.0 , 0.0 ,
0.0 , 0.0 , 0.0 , 0.0 ,
] ,
[
0.0 , 0.1 , 0.7 , 0.1 ,
0.0 , 0.1 , 0.0 ,
0.0 , 0.0 , 0.0 , 0.0 ,
] ,
[
0.0 , 0.0 , 0.0 , 0.0 ,
0.0 , 0.0 , 0.0 ,
0.0 , 0.0 , 0.0 , 0.0 ,
] ,
[
0.7 , 0.0 , 0.0 , 0.0 ,
0.2 , 0.0 , 0.0 ,
0.1 , 0.0 , 0.0 , 0.0 ,
] ,
[
0.0 , 0.0 , 0.7 , 0.0 ,
0.0 , 0.1 , 0.1 ,
0.0 , 0.0 , 0.1 , 0.0 ,
] ,
[
0.0 , 0.0 , 0.0 , 0.0 ,
0.0 , 0.0 , 0.0 ,
0.0 , 0.0 , 0.0 , 0.0 ,
] ,
[
0.0 , 0.0 , 0.0 , 0.0 ,
0.7 , 0.0 , 0.0 ,
0.2 , 0.1 , 0.0 , 0.0 ,
] ,
[
0.0 , 0.0 , 0.0 , 0.0 ,
0.0 , 0.0 , 0.0 ,
0.1 , 0.8 , 0.1 , 0.0 ,
] ,
[
0.0 , 0.0 , 0.0 , 0.0 ,
0.0 , 0.7 , 0.0 ,
0.0 , 0.1 , 0.1 , 0.1 ,
] ,
[
0.0 , 0.0 , 0.0 , 0.0 ,
0.0 , 0.0 , 0.7 ,
0.0 , 0.0 , 0.1 , 0.2 ,
] ,
]
P_h = sparse(reduce(hcat, P_h)')
end
# ╔═╡ 70edf811-adb0-4ae8-941a-b298d85a6e0e
md """
` ` P_b ` ` la politique du déplacement toujours en bas :
"""
# ╔═╡ 875673f1-08c9-4713-bbc2-85b0a7a0cb0a
begin
P_b = [
[
0.2 , 0.1 , 0.0 , 0.0 ,
0.7 , 0.0 , 0.0 ,
0.0 , 0.0 , 0.0 , 0.0 ,
] ,
[
0.1 , 0.8 , 0.1 , 0.0 ,
0.0 , 0.0 , 0.0 ,
0.0 , 0.0 , 0.0 , 0.0 ,
] ,
[
0.0 , 0.1 , 0.1 , 0.1 ,
0.0 , 0.7 , 0.0 ,
0.0 , 0.0 , 0.0 , 0.0 ,
] ,
[
0.0 , 0.0 , 0.0 , 0.0 ,
0.0 , 0.0 , 0.0 ,
0.0 , 0.0 , 0.0 , 0.0 ,
] ,
[
0.1 , 0.0 , 0.0 , 0.0 ,
0.2 , 0.0 , 0.0 ,
0.7 , 0.0 , 0.0 , 0.0 ,
] ,
[
0.0 , 0.0 , 0.1 , 0.0 ,
0.0 , 0.1 , 0.1 ,
0.0 , 0.0 , 0.7 , 0.0 ,
] ,
[
0.0 , 0.0 , 0.0 , 0.0 ,
0.0 , 0.0 , 0.0 ,
0.0 , 0.0 , 0.0 , 0.0 ,
] ,
[
0.0 , 0.0 , 0.0 , 0.0 ,
0.1 , 0.0 , 0.0 ,
0.8 , 0.1 , 0.0 , 0.0 ,
] ,
[
0.0 , 0.0 , 0.0 , 0.0 ,
0.0 , 0.0 , 0.0 ,
0.1 , 0.8 , 0.1 , 0.0 ,
] ,
[
0.0 , 0.0 , 0.0 , 0.0 ,
0.0 , 0.1 , 0.0 ,
0.0 , 0.1 , 0.7 , 0.1 ,
] ,
[
0.0 , 0.0 , 0.0 , 0.0 ,
0.0 , 0.0 , 0.1 ,
0.0 , 0.0 , 0.1 , 0.8 ,
] ,
]
P_b = sparse(reduce(hcat, P_b)')
end
# ╔═╡ 2deaac7c-ad14-43b0-9cd5-9f0ec12d324c
md """
` ` P_d ` ` la politique du déplacement toujours à droite :
"""
# ╔═╡ b5c93b6f-933c-41b4-8399-44cc0fa07fab
begin
P_d = [
[
0.2 , 0.7 , 0.0 , 0.0 ,
0.1 , 0.0 , 0.0 ,
0.0 , 0.0 , 0.0 , 0.0 ,
] ,
[
0.1 , 0.2 , 0.7 , 0.0 ,
0.0 , 0.0 , 0.0 ,
0.0 , 0.0 , 0.0 , 0.0 ,
] ,
[
0.0 , 0.1 , 0.1 , 0.7 ,
0.0 , 0.1 , 0.0 ,
0.0 , 0.0 , 0.0 , 0.0 ,
] ,
[
0.0 , 0.0 , 0.0 , 0.0 ,
0.0 , 0.0 , 0.0 ,
0.0 , 0.0 , 0.0 , 0.0 ,
] ,
[
0.1 , 0.0 , 0.0 , 0.0 ,
0.8 , 0.0 , 0.0 ,
0.1 , 0.0 , 0.0 , 0.0 ,
] ,
[
0.0 , 0.0 , 0.1 , 0.0 ,
0.0 , 0.1 , 0.7 ,
0.0 , 0.0 , 0.1 , 0.0 ,
] ,
[
0.0 , 0.0 , 0.0 , 0.0 ,
0.0 , 0.0 , 0.0 ,
0.0 , 0.0 , 0.0 , 0.0 ,
] ,
[
0.0 , 0.0 , 0.0 , 0.0 ,
0.1 , 0.0 , 0.0 ,
0.2 , 0.7 , 0.0 , 0.0 ,
] ,
[
0.0 , 0.0 , 0.0 , 0.0 ,
0.0 , 0.0 , 0.0 ,
0.1 , 0.2 , 0.7 , 0.0 ,
] ,
[
0.0 , 0.0 , 0.0 , 0.0 ,
0.0 , 0.1 , 0.0 ,
0.0 , 0.1 , 0.1 , 0.7 ,
] ,
[
0.0 , 0.0 , 0.0 , 0.0 ,
0.0 , 0.0 , 0.1 ,
0.0 , 0.0 , 0.1 , 0.8 ,
] ,
]
P_d = sparse(reduce(hcat, P_d)')
end
# ╔═╡ 8015bdbb-82dd-48da-905d-a25e5c864298
md """
Pour trouver la politique optimal , on peut procéder de la façon suivante :
Initialiser ` ` V_ \ star ` ` ( random ) . \
Tant qu ' on a pas convergé ( ou atteint ` MAX_ITERATIONS ` ) :
- Calculer pour chaque direction ( gauche , haut , bas , droite ) le vecteur correspondant à la fonction valeur de la politique associée à la direction .
- Sélectionner notre nouvel ` V_optimal ` comme le maximum par ligne de nos vecteurs issus des fonctions valeur ( des quatre directions ) .
- Vérifier la convergeance par comparaison avec l ' itération précédente .
Par application de cet algorithme on obtient alors ` ` V_ \ star ` ` :
"""
# ╔═╡ 3d7d0b11-5b99-4b1f-ab06-3366678eece8
begin
	Random.seed!(RANDOM_SEED)

	V_optimal = rand(length(REWARDS))
	pi = zeros(length(REWARDS))

	for _ in 1:MAX_ITERATIONS
		local V_g = REWARDS + Matrix(DISCOUNT_FACTOR * P_g) * V_optimal
		local V_h = REWARDS + Matrix(DISCOUNT_FACTOR * P_h) * V_optimal
		local V_b = REWARDS + Matrix(DISCOUNT_FACTOR * P_b) * V_optimal
		local V_d = REWARDS + Matrix(DISCOUNT_FACTOR * P_d) * V_optimal

		local V_new = maximum.(eachrow([V_g V_h V_b V_d]))

		if isapprox(V_new, V_optimal)
			pi = argmax.(eachrow([V_g V_h V_b V_d]))
			break
		else
			V_optimal = V_new
		end
	end

	local V_display = copy(V_optimal)
	insert!(V_display, 6, 0)
	V_display = permutedims(reshape(V_display, 4, 3))
	V_display = sparse(V_display)
end
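# A minimal sketch of the check requested in Question 2 ("verify that the solution
# you obtain satisfies the Bellman equation"): V_optimal should be a fixed point of
# the componentwise backup max_a(R + γ Pᵃ V). This assumes the names REWARDS,
# DISCOUNT_FACTOR, P_g, P_h, P_b, P_d and V_optimal defined in the cells above.
begin
	local B_g = REWARDS + Matrix(DISCOUNT_FACTOR * P_g) * V_optimal
	local B_h = REWARDS + Matrix(DISCOUNT_FACTOR * P_h) * V_optimal
	local B_b = REWARDS + Matrix(DISCOUNT_FACTOR * P_b) * V_optimal
	local B_d = REWARDS + Matrix(DISCOUNT_FACTOR * P_d) * V_optimal
	# row-wise max over the four actions, compared with the Value Iteration result
	isapprox(maximum.(eachrow([B_g B_h B_b B_d])), V_optimal)
end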
# ╔═╡ 664bb753-ccce-4c7a-8b11-76261a3b80d2
md """
## Question 3
> Explain how you can infer the optimal action in every state from the optimal value function ` ` V_ \ star ( s ) ` ` . Represent in a 2 D matrix the optimal policy .
"""
# ╔═╡ df01ea55-b289-4c13-8a6b-780ce068e44c
md """
La politique optimale se trouve en sélectionnant la direction la plus favorable dans chaque état :
"""
# ╔═╡ d7ff1cb5-d2b4-4597-bcef-0f74f2e7e0db
begin
	pi_symbols = [ARROW_SYMBOLS[i] for i in pi]
	insert!(pi_symbols, 6, WALL_SYMBOL)
	pi_symbols[4] = SUCCESS_SYMBOL
	pi_symbols[8] = DEATH_SYMBOL

	permutedims(reshape(pi_symbols, 4, 3))
end
# ╔═╡ 40b7e793-d869-4b68-83a1-6bd7d20a3941
md """
## Question 4
> Compare the performances obtained with the random policy and the optimal one , how can you conclude that the optimal policy performs better ?
"""
# ╔═╡ dce3978b-1334-426e-80cc-9cfe63989909
md """
À partir ` ` \ pi ^ \ star ` ` on peut aussi trouver ` ` P ^ \ star ` ` la matrice de notre politique optimale :
"""
# ╔═╡ 7aae25dc-38cf-40d5-a7da-44d13d397194
begin
	P_star = sparse(zeros(11, 11))

	for i in 1:11
		if pi[i] == 1
			P_star[i, :] = P_g[i, :]
		elseif pi[i] == 2
			P_star[i, :] = P_h[i, :]
		elseif pi[i] == 3
			P_star[i, :] = P_b[i, :]
		else
			P_star[i, :] = P_d[i, :]
		end
	end

	P_star
end
# ╔═╡ b075f5fc-85ac-45a0-8e27-605d3dac0e97
begin
	Random.seed!(RANDOM_SEED)

	V_Prandom = rand(length(REWARDS))
	V_Poptimal = rand(length(REWARDS))

	ratio = Vector{Float64}()
	convergence_random = Vector{Float64}()
	convergence_optimal = Vector{Float64}()

	for _ in 1:MAX_ITERATIONS
		V_Prandom = REWARDS + Matrix(DISCOUNT_FACTOR * P) * V_Prandom
		V_Poptimal = REWARDS + Matrix(DISCOUNT_FACTOR * P_star) * V_Poptimal

		append!(convergence_optimal, norm(V_Poptimal - V_optimal))
		append!(convergence_random, norm(V_Prandom - V))
		append!(ratio, norm(V_Poptimal ./ V_Prandom))
	end
end
# ╔═╡ 1fe62967-a9ea-4f6a-817e-666a900c8f92
plot(
	[convergence_optimal, convergence_random],
	labels=["Optimal" "Random"],
	xlabel=L"n",
	ylabel=L"||V^\star - V^r||^2",
	yticks=[10.0^-x for x in 0:20],
	linewidth=2,
	yaxis=:log,
	title="Optimal vs Random: Convergence",
)
# ╔═╡ f31ce9b6-8399-4263-bad7-20c859116fa9
begin
	plot(
		ratio,
		labels="",
		xlabel=L"n",
		ylabel=L"||V^\star / V^r||^2",
		linewidth=2,
		title="Optimal vs Random: Ratio",
		ylims=[0, Inf],
	)
end
# ╔═╡ 05373383-0c51-49f2-8a62-b06a6225d659
md """
## Question 5
> * * Policy Iteration I * * : We are now going to calculate the optimal policy using Policy Iteration ( lecture 2 , 2 nd part , slides 23 / 35 and 24 / 35 ) . You can start with the random policy for which you calculated its performance in the * * Performance Prediction * * section . Carry out a one - step improvement ( or greedy step ) on the random policy . Represent in a 2 D matrix the policy you obtain . How can we verify that it is a better policy than the random one ?
"""
# ╔═╡ 81572e40-4cde-4a13-84aa-5c5d6a9dbde3
md """
0. Initialization : choose a policy ` ` \ pi_0 ` `
On reprend ici notre politique aléatoire ` ` P ^ { \ pi_0 } ` ` de la [ question 1 partie 1 ] ( #beb410a8-03e2-4f18-8ccd-941cc926ee12):
"""
# ╔═╡ 4b264154-944d-498b-a998-a4b07f77918e
begin
P_pi_0 = P
P_pi_0
end
# ╔═╡ a68a3d33-f4df-456e-af13-9b39e14dbc13
md """
2. Policy Evaluation : Compute iteratively ` ` V_ { \ pi_k } = ( I − \ gamma P ^ { \ pi_k } ) ^ { - 1 } R ^ { \ pi_k } ` `
( on calcule uniquement ` ` V_ { \ pi_0 } ` ` dans cette question )
"""
# ╔═╡ c3a6ab2c-7a3e-458f-a108-e6e81aa3def1
begin
	V_pi_0 = Matrix(I(length(REWARDS)) - DISCOUNT_FACTOR * P_pi_0) \ REWARDS

	local V_display = copy(V_pi_0)
	insert!(V_display, 6, 0)
	V_display = permutedims(reshape(V_display, 4, 3))
	V_display = sparse(V_display)
end
# ╔═╡ ea457cd9-0db5-433f-9d57-1e875a160990
md """
3. Policy improvement : Compute ` ` \ pi_ { k + 1 } = \ text { greedy } ( V_ { \ pi_k } ) ` `
( On calcule donc ici uniquement ` ` \ pi_1 ` ` )
"""
# ╔═╡ 3d62d11d-383c-4060-b697-be0c0155ce95
begin
	local V_g = REWARDS + Matrix(DISCOUNT_FACTOR * P_g) * V_pi_0
	local V_h = REWARDS + Matrix(DISCOUNT_FACTOR * P_h) * V_pi_0
	local V_b = REWARDS + Matrix(DISCOUNT_FACTOR * P_b) * V_pi_0
	local V_d = REWARDS + Matrix(DISCOUNT_FACTOR * P_d) * V_pi_0

	local pi_1 = argmax.(eachrow([V_g V_h V_b V_d]))

	P_pi_1 = sparse(zeros(11, 11))
	for i in 1:11
		if pi_1[i] == 1
			P_pi_1[i, :] = P_g[i, :]
		elseif pi_1[i] == 2
			P_pi_1[i, :] = P_h[i, :]
		elseif pi_1[i] == 3
			P_pi_1[i, :] = P_b[i, :]
		else
			P_pi_1[i, :] = P_d[i, :]
		end
	end

	P_pi_1
end
# ╔═╡ 245f3394-d5e3-4d2c-96a6-ce5ea0bc7d84
md """
Stop if ` ` V_ { \ pi_ { k + 1 } } = V_ { \ pi_k } ` ` , else repeat
( Ici on s ' arrête comme le dit l ' énoncé pour k = 1 )
"""
# ╔═╡ 4f597447-f321-4a8f-adf0-3fd655ab203c
begin
	diff_star_pi_0 = sum(abs.(P_star - P_pi_0))
	diff_star_pi_1 = sum(abs.(P_star - P_pi_1))

	md"""
	We can check that ``\pi_1`` is better than ``\pi_0`` by computing:

	``||\pi_\star - \pi_1||_\text{F} =`` $(diff_star_pi_1)

	``||\pi_\star - \pi_0||_\text{F} =`` $(diff_star_pi_0)
	"""
end
# ╔═╡ d599c370-6cb5-4bc3-a333-d41e207c39dc
if diff_star_pi_1 <= diff_star_pi_0
	Markdown.MD(Markdown.Admonition("correct", "We obtain a better policy after one iteration", [md"``||\pi_\star - \pi_1||_\text{F} \leq ||\pi_\star - \pi_0||_\text{F}``"]));
else
	Markdown.MD(Markdown.Admonition("danger", "We do not obtain a better policy after one iteration", [md"``||\pi_\star - \pi_1||_\text{F} \nleq ||\pi_\star - \pi_0||_\text{F}``"]));
end
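# A minimal sketch of an alternative check via the policy improvement theorem:
# after the greedy step, the new value function should dominate the old one
# componentwise, i.e. V_{π₁}(s) ≥ V_{π₀}(s) in every state s. This assumes the
# names P_pi_1, V_pi_0, REWARDS and DISCOUNT_FACTOR defined in the cells above.
begin
	local V_pi_1 = Matrix(I(length(REWARDS)) - DISCOUNT_FACTOR * P_pi_1) \ REWARDS
	all(V_pi_1 .>= V_pi_0 .- 1e-8) # small tolerance for floating-point noise
end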
# ╔═╡ 4e8e49b2-60ea-4dc6-906b-d459c7983b34
md """
## Question 6
> * * Policy Iteration II * * : Continue iterating the Prediction and the greedy steps until convergence to the optimal policy .
"""
# ╔═╡ 362a3786-f85d-44b9-b369-ecbf4e5194e9
begin
	P_pi_k = P_pi_0

	local diffs = Vector{Float64}()

	for k in 1:MAX_ITERATIONS
		V_pi_k = Matrix(I(length(REWARDS)) - DISCOUNT_FACTOR * P_pi_k) \ REWARDS

		local V_g = REWARDS + Matrix(DISCOUNT_FACTOR * P_g) * V_pi_k
		local V_h = REWARDS + Matrix(DISCOUNT_FACTOR * P_h) * V_pi_k
		local V_b = REWARDS + Matrix(DISCOUNT_FACTOR * P_b) * V_pi_k
		local V_d = REWARDS + Matrix(DISCOUNT_FACTOR * P_d) * V_pi_k

		local pi_k = argmax.(eachrow([V_g V_h V_b V_d]))

		global P_pi_k = sparse(zeros(11, 11))
		for i in 1:11
			if pi_k[i] == 1
				P_pi_k[i, :] = P_g[i, :]
			elseif pi_k[i] == 2
				P_pi_k[i, :] = P_h[i, :]
			elseif pi_k[i] == 3
				P_pi_k[i, :] = P_b[i, :]
			else
				P_pi_k[i, :] = P_d[i, :]
			end
		end

		append!(diffs, sum(abs.(P_star - P_pi_k)))

		if isapprox(P_star, P_pi_k)
			break
		end
	end

	local p = plot(
		diffs,
		labels="",
		xlabel=L"k",
		ylabel=L"||\pi_\star - \pi_k||_F",
		linewidth=2,
		title="Policy Iteration convergence",
	)
	xticks!(round(Int, xlims(p)[1]):round(Int, xlims(p)[2]))
end
# ╔═╡ a1eaf48e-f92f-4554-942e-f6303ebaa084
md """
## Question 7
> Investigate the structure of the optimal policy for different values of ` ` \ gamma ` ` , and explain the results . You might use Value Iteration or Policy Iteration .
"""
# ╔═╡ 8d5b2cc2-2e21-47df-b821-189de5d357a3
begin
	local gammas = 0.9:0.001:0.99
	local iterations = zeros(length(gammas))

	for (i, gamma) in enumerate(gammas)
		global P_pi_k2 = P_pi_0
		k = 0

		while true
			V_pi_k = Matrix(I(length(REWARDS)) - gamma * P_pi_k2) \ REWARDS

			local V_g = REWARDS + Matrix(gamma * P_g) * V_pi_k
			local V_h = REWARDS + Matrix(gamma * P_h) * V_pi_k
			local V_b = REWARDS + Matrix(gamma * P_b) * V_pi_k
			local V_d = REWARDS + Matrix(gamma * P_d) * V_pi_k

			local pi_k = argmax.(eachrow([V_g V_h V_b V_d]))

			P_pi_k2 = sparse(zeros(11, 11))
			for j in 1:11
				if pi_k[j] == 1
					P_pi_k2[j, :] = P_g[j, :]
				elseif pi_k[j] == 2
					P_pi_k2[j, :] = P_h[j, :]
				elseif pi_k[j] == 3
					P_pi_k2[j, :] = P_b[j, :]
				else
					P_pi_k2[j, :] = P_d[j, :]
				end
			end

			k += 1

			if isapprox(P_star, P_pi_k2) || k >= MAX_ITERATIONS
				break
			end
		end

		iterations[i] = k
	end

	local p = plot(
		gammas,
		iterations,
		labels="",
		xlabel=L"\gamma",
		ylabel=L"k",
		linetype=:steppre,
		linewidth=2,
		title=md"Policy Iteration convergence according to ``\gamma``",
	)
	yticks!(round.(Int, yticks(p)[1][1]))
end
# ╔═╡ 0c6fd7ed-5180-41bd-9958-29cc9f3ce73b
md """
On observe qu ' il y a convergence de la politique généralement en dessous de 5 itérations . Cependant pour certaines combinaisons d ' hyperparamètres on remarque qu ' il n ' y a pas convergence .
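
A likely explanation for the apparent non-convergence is the stopping criterion itself: the loop compares ``P_{\pi_k}`` with ``P^\star``, which was computed for the slider value of ``\gamma`` (`DISCOUNT_FACTOR`). For values of ``\gamma`` whose optimal policy differs from that reference, the test never succeeds and the counter simply runs up to `MAX_ITERATIONS`, even though Policy Iteration always terminates after finitely many policy changes.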
"""
# ╔═╡ Cell order:
# ╟─02b1e10c-653e-4660-90b5-2eae7f19f1f7
# ╟─26fdd17e-f03a-4835-93be-85303fe526d8
# ╟─56ac3473-24f4-42d7-84e1-cfce6a70d8d5
# ╟─ccf4d63e-7ace-11ed-2123-d9dbb62bd308
# ╟─9f2879c1-c22b-4067-ad20-4e4c56cc8d00
# ╟─0a30a68a-068e-41fb-92c4-000869ba7dff
# ╟─07b57746-fba0-49aa-ba17-6dcb0bbe44e5
# ╟─92d6874b-651c-4551-840e-ad5d1e934aeb
# ╟─fe44d7f2-155e-42f2-83c3-dd18aadb3810
# ╟─28b769a6-dd3c-43ab-bae0-646d8ebc35d6
# ╟─3881603c-619b-4976-ac4c-2c7e7f3a6ec7
# ╟─fb797a9b-6a0a-4a77-a9b6-6804f98639bb
# ╟─1e3abda8-6645-48ba-874d-28e1011fc3e3
# ╟─beb410a8-03e2-4f18-8ccd-941cc926ee12
# ╟─133f291f-6f21-4441-86f7-ba190a7d6b1f
# ╟─e14f9977-d2fd-4d05-84d6-614008dc0c4a
# ╟─486c93ab-9cb9-4df4-b702-bbe12a961647
# ╟─ab2d705d-fc00-43b2-bb6d-2a3d4ba9dab1
# ╟─b7ae89c9-3c1b-4f5c-af5b-164d95ccca41
# ╟─03c17428-5ab9-42e7-bf79-92eb846f11cb
# ╟─c65d0dbc-ecd7-4320-9b3a-a1b9c0545f9a
# ╟─ad547684-bcbe-44f4-9fc1-f327d2db4584
# ╟─d3703ab8-912c-417d-acd9-29590ec1134b
# ╟─1319b304-5126-4825-8076-e113e4dd3635
# ╟─3ea3f177-c576-4b9e-a54b-c427e29a8491
# ╟─e94fe8a6-274b-4121-b1fc-063d3710c2f7
# ╟─80090d5f-d56c-4844-a04f-444ed49e5f34
# ╟─98362798-aae4-4540-9e98-cc7371802552
# ╟─30874daf-7b0e-4335-9a50-d19389cf1620
# ╟─c005a3f8-765c-4a50-90ef-73a5a72eee01
# ╟─1b43e9e5-d7d2-4b5e-a2b2-3a8b8eda6d62
# ╟─add0221b-e352-4559-a722-c45a64f573f9
# ╟─84e07dce-bf6d-4ac1-bfa4-65414fe1d787
# ╟─df13fa05-14de-409b-a0b1-5bba5eff432e
# ╟─ac490e4a-ce20-4288-a04f-c224df5ade1a
# ╟─33890f22-d3f6-4bcf-870d-756f7ff250a9
# ╟─cf9fb8a8-6c93-4c43-9f01-5f198f0cf4aa
# ╟─dc87b85f-c87c-4302-9124-194bd799f1fd
# ╟─b2595dec-aa5b-462b-b0f8-3555c1231b2f
# ╟─70edf811-adb0-4ae8-941a-b298d85a6e0e
# ╟─875673f1-08c9-4713-bbc2-85b0a7a0cb0a
# ╟─2deaac7c-ad14-43b0-9cd5-9f0ec12d324c
# ╟─b5c93b6f-933c-41b4-8399-44cc0fa07fab
# ╟─8015bdbb-82dd-48da-905d-a25e5c864298
# ╟─3d7d0b11-5b99-4b1f-ab06-3366678eece8
# ╟─664bb753-ccce-4c7a-8b11-76261a3b80d2
# ╟─df01ea55-b289-4c13-8a6b-780ce068e44c
# ╟─d7ff1cb5-d2b4-4597-bcef-0f74f2e7e0db
# ╟─40b7e793-d869-4b68-83a1-6bd7d20a3941
# ╟─dce3978b-1334-426e-80cc-9cfe63989909
# ╟─7aae25dc-38cf-40d5-a7da-44d13d397194
# ╟─b075f5fc-85ac-45a0-8e27-605d3dac0e97
# ╟─1fe62967-a9ea-4f6a-817e-666a900c8f92
# ╟─f31ce9b6-8399-4263-bad7-20c859116fa9
# ╟─05373383-0c51-49f2-8a62-b06a6225d659
# ╟─81572e40-4cde-4a13-84aa-5c5d6a9dbde3
# ╟─4b264154-944d-498b-a998-a4b07f77918e
# ╟─a68a3d33-f4df-456e-af13-9b39e14dbc13
# ╟─c3a6ab2c-7a3e-458f-a108-e6e81aa3def1
# ╟─ea457cd9-0db5-433f-9d57-1e875a160990
# ╟─3d62d11d-383c-4060-b697-be0c0155ce95
# ╟─245f3394-d5e3-4d2c-96a6-ce5ea0bc7d84
# ╟─4f597447-f321-4a8f-adf0-3fd655ab203c
# ╟─d599c370-6cb5-4bc3-a333-d41e207c39dc
# ╟─4e8e49b2-60ea-4dc6-906b-d459c7983b34
# ╟─362a3786-f85d-44b9-b369-ecbf4e5194e9
# ╟─a1eaf48e-f92f-4554-942e-f6303ebaa084
# ╟─8d5b2cc2-2e21-47df-b821-189de5d357a3
# ╟─0c6fd7ed-5180-41bd-9958-29cc9f3ce73b