三、序列问题（2）

上文中提及的精确解方法适用于小型离散问题，对于较大状态空间的问题，计算精确解需要极大的内存量，因而考虑近似解的方法。常使用approximate dynamic programming的方法去寻求近似解，进而使用在线方法实现实时计算。

2. 近似值函数

2.1 参数化表示

记值函数的参数化表示为$\mathcal{U}_{\theta} (s)$。

# Settings for approximate value iteration; consumed by `solve`.
struct ApproximateValueIteration
    Uθ # initial parameterized value function; must support `fit!(Uθ, S, U)`
    S # discrete states at which backups are performed
    k_max # number of backup-and-fit iterations to run
end

# Run approximate value iteration on the MDP `𝒫` with the settings in `M`.
# Each of the `k_max` sweeps backs up every state in `S` against the current
# approximation and then hands the resulting utilities to `fit!`, so the
# object stored in `M.Uθ` is the one being refitted. Returns a
# `ValueFunctionPolicy` built from `𝒫` and the fitted `Uθ`.
function solve(M::ApproximateValueIteration, 𝒫::MDP)
    Uθ = M.Uθ
    states = M.S
    for _ in 1:M.k_max
        targets = [backup(𝒫, Uθ, state) for state in states]
        fit!(Uθ, states, targets)
    end
    return ValueFunctionPolicy(𝒫, Uθ)
end

接下来提及的所有参数化表示均可与上述逼近算法一起使用，前提是该参数化表示支持$\mathcal{U}_{\theta}$的计算，以及将$\mathcal{U}_{\theta}$拟合到$S$中各点处的效用估计。

参数化表示分为两类：

局部近似方法，其中$\theta$对应于$S$中状态的值。
全局近似方法，其中$\theta$与$S$中状态的值不直接相关。

但两者本质上都可以视为一个线性函数逼近，即$\mathcal{U}_{\theta}(s) = \theta^{\rm T} \beta(s)$。

2.2 最邻近方法

# Value function approximated by averaging the values of the nearest stored states.
# Mutable because `fit!` replaces `θ`.
mutable struct NearestNeighborValueFunction
    k # number of neighbors averaged per query
    d # distance function, called as d(s, s′)
    S # set of discrete states
    θ # vector of values at states in S (θ[i] belongs to the i-th state of S)
end

# Evaluate the approximation at `s`: the mean of the stored values `θ` over the
# `k` states of `S` that are closest to `s` under the distance function `d`.
function (Uθ::NearestNeighborValueFunction)(s)
    distances = [Uθ.d(s, s′) for s′ in Uθ.S]
    order = sortperm(distances)
    nearest = order[1:Uθ.k]
    return mean(Uθ.θ[idx] for idx in nearest)
end

# Refit by storing the utilities `U` directly as the per-state values `θ`.
# `S` is accepted for interface uniformity and is not used. Returns `Uθ`.
fit!(Uθ::NearestNeighborValueFunction, S, U) = (Uθ.θ = U; Uθ)

2.3 核光滑方法

# Value function approximated by kernel-weighted averaging of stored values.
# Mutable because `fit!` replaces `θ`.
mutable struct LocallyWeightedValueFunction
    k # kernel function, called as k(s, s′)
    S # set of discrete states
    θ # vector of values at states in S
end

# Evaluate the approximation at `s`: kernel values between `s` and every state
# in `S` are normalized with the 1-norm and used as weights on `θ`.
function (Uθ::LocallyWeightedValueFunction)(s)
    kernel_values = [Uθ.k(s, s′) for s′ in Uθ.S]
    weights = normalize(kernel_values, 1)
    return dot(Uθ.θ, weights)
end

# Refit by storing the utilities `U` directly as the per-state values `θ`.
# `S` is accepted for interface uniformity and is not used. Returns `Uθ`.
fit!(Uθ::LocallyWeightedValueFunction, S, U) = (Uθ.θ = U; Uθ)

2.4 线性插值

在这里插入图片描述

# Value function defined by multilinear interpolation over a regular grid.
# Mutable because `fit!` replaces `θ`.
mutable struct MultilinearValueFunction
    o # position of lower-left corner of the grid
    δ # vector of cell widths, one per dimension
    θ # values at the grid states; the call method indexes it with one index per dimension
end

# Evaluate the multilinear interpolant at state `s`.
# The value is a weighted sum over the 2^n vertices of the grid cell that
# contains `s` (n = length(s)); each vertex weight is the product, over the
# dimensions, of the fractional distance to the opposite face of the cell.
function (Uθ::MultilinearValueFunction)(s)
    origin, widths, grid = Uθ.o, Uθ.δ, Uθ.θ
    # Position of s measured in cell widths from the lower-left corner.
    Δ = (s - origin) ./ widths
    # 1-based index of the cell's lower-left vertex, capped at size(grid) .- 1
    # so that the neighbouring vertex `cell .+ 1` stays inside the grid.
    cell = min.(floor.(Int, Δ) .+ 1, size(grid) .- 1)
    corner = similar(cell)
    n = length(s)
    total = 0.0
    # Every integer in 0:2^n-1 names one vertex: bit j picks the upper (set)
    # or lower (clear) neighbour along dimension j.
    for mask in 0:(2^n - 1)
        w = 1.0
        for j in 1:n
            takes_upper = mask & (1 << (j - 1)) > 0
            corner[j] = takes_upper ? cell[j] + 1 : cell[j]
            w *= takes_upper ? Δ[j] - cell[j] + 1 : cell[j] - Δ[j]
        end
        total += grid[corner...] * w
    end
    return total
end

# Refit by storing the utilities `U` directly as the grid values `θ`.
# `S` is accepted for interface uniformity and is not used. Returns `Uθ`.
fit!(Uθ::MultilinearValueFunction, S, U) = (Uθ.θ = U; Uθ)

2.5 单纯形插值

# Value function defined by simplex interpolation over a regular grid.
# Mutable because `fit!` replaces `θ`.
mutable struct SimplexValueFunction
    o # position of lower-left corner of the grid
    δ # vector of cell widths, one per dimension
    θ # values at the grid states; the call method indexes it with one index per dimension
end

# Evaluate the simplex interpolant at state `s`.
# Only length(s) + 1 vertices of the containing grid cell contribute: starting
# from the upper-right vertex, the index is stepped down one dimension at a
# time, and each visited vertex is weighted by the gap between consecutive
# sorted local coordinates of `s`.
function (Uθ::SimplexValueFunction)(s)
    # Position of s measured in cell widths from the corner o.
    Δ = (s - Uθ.o)./Uθ.δ
    # Multidimensional (1-based) index of the cell's upper-right vertex: the
    # lower-left index is capped at size(θ) .- 1 and then shifted by one.
    i = min.(floor.(Int, Δ) .+ 1, size(Uθ.θ) .- 1) .+ 1
    u = 0.0
    # Coordinates of s relative to the cell's lower-left vertex (index i .- 1,
    # located at o + δ.*(i .- 2)), again in units of cell widths.
    s′ = (s - (Uθ.o + Uθ.δ.*(i.-2))) ./ Uθ.δ
    p = sortperm(s′) # dimensions ordered by increasing local coordinate
    w_tot = 0.0 # total weight handed out so far
    # Visit vertices from the upper-right one downwards, decrementing the
    # dimension with the smallest local coordinate first. `i` is mutated here.
    for j in p
        w = s′[j] - w_tot # gap between consecutive sorted coordinates
        u += w*Uθ.θ[i...]
        i[j] -= 1 # step to the next vertex along dimension j
        w_tot += w
    end
    # The last vertex reached receives whatever weight is left.
    u += (1 - w_tot)*Uθ.θ[i...]
    return u
end

# Refit by storing the utilities `U` directly as the grid values `θ`.
# `S` is accepted for interface uniformity and is not used. Returns `Uθ`.
fit!(Uθ::SimplexValueFunction, S, U) = (Uθ.θ = U; Uθ)

2.6 线性回归与神经网络回归

下面介绍全局方法。线性回归需要一组基函数$\beta(s)$，值函数表示为这些基函数的线性组合（基函数本身不必是线性的），如下：

# Value function represented as a dot product of basis features and parameters.
# Mutable because `fit!` replaces `θ`.
mutable struct LinearRegressionValueFunction
    β # basis vector function, called as β(s)
    θ # vector of parameters
end

# Evaluate the approximation at `s` as the dot product of the basis vector
# `β(s)` with the parameter vector `θ`.
(Uθ::LinearRegressionValueFunction)(s) = dot(Uθ.β(s), Uθ.θ)

# Refit the parameters to the utilities `U` at the states `S`.
# The basis vectors of all states are stacked as the rows of `X`, and `θ` is
# set to pinv(X) * U. Returns `Uθ`.
function fit!(Uθ::LinearRegressionValueFunction, S, U)
    features = [Uθ.β(s) for s in S]
    # hcat puts one basis vector per column; the adjoint turns them into rows.
    X = adjoint(hcat(features...))
    Uθ.θ = pinv(X) * U
    return Uθ
end

神经网络回归不必按照线性回归的要求构造一组适当的基函数。相反，使用神经网络来表示值函数。

3. 在线规划

3.1 滚动时域规划（Receding Horizon Planning）

预测控制的优化不是一次离线进行，而是随着采样时刻的前进反复地在线进行，故而该方法面临着确定滚动深度的问题。这种优化虽然得不到理想的全局最优解，但是反复对每一采样时刻的偏差进行优化计算，将可及时地校正控制过程中出现的各种复杂情况。

3.2 Lookahead with Rollouts

# Online planner that scores states by rolling out a fixed policy.
struct RolloutLookahead
    𝒫 # problem
    π # rollout policy, called as π(s)
    d # rollout depth (number of simulated steps)
end

# Take one step from state `s` under action `a` by delegating to the problem's
# `TR` field; callers in this file destructure the result as `(s′, r)`.
function randstep(𝒫::MDP, s, a)
    return 𝒫.TR(s, a)
end

# Simulate policy `π` from state `s` for `d` steps in problem `𝒫` and return
# the accumulated discounted reward; the first reward is undiscounted (γ^0).
function rollout(𝒫, s, π, d)
    ret = 0.0
    for elapsed in 0:(d - 1)
        s, r = randstep(𝒫, s, π(s))
        ret += 𝒫.γ^elapsed * r
    end
    return ret
end

# Choose an action for state `s`: act greedily with respect to a utility
# estimate obtained by rolling out the policy `π.π` for `π.d` steps.
# Returns only the action (the `a` field of the result of `greedy`), like the
# other planners in this file.
function (π::RolloutLookahead)(s)
    U(s′) = rollout(π.𝒫, s′, π.π, π.d)
    # The original ended in a dangling `.` with the field name missing.
    return greedy(π.𝒫, U, s).a
end

3.3 正向搜索（Forward Search）

# Online planner that exhaustively searches all actions to a fixed depth.
struct ForwardSearch
    𝒫 # problem
    d # search depth
    U # value function applied at depth d (the leaves), called as U(s)
end

# Depth-limited forward search from state `s`.
# Returns a named tuple `(a, u)` with the best action and its value. At
# `d ≤ 0` the action is `nothing` and the value is `U(s)`. If no action in
# `𝒫.𝒜` scores above -Inf the result is `(a=nothing, u=-Inf)`.
function forward_search(𝒫, s, d, U)
    d ≤ 0 && return (a=nothing, u=U(s))
    # Value of a successor state, obtained by searching one level shallower.
    U′(s′) = forward_search(𝒫, s′, d - 1, U).u
    best_a, best_u = nothing, -Inf
    for a in 𝒫.𝒜
        u = lookahead(𝒫, U′, s, a)
        if u > best_u
            best_a, best_u = a, u
        end
    end
    return (a=best_a, u=best_u)
end

# Choose an action for state `s` by running `forward_search` with the planner's
# settings and keeping only the action.
function (π::ForwardSearch)(s)
    result = forward_search(π.𝒫, s, π.d, π.U)
    return result.a
end

3.4 分支定界方法（Branch and Bound）

# Online planner that searches to a fixed depth and prunes using value bounds.
struct BranchAndBound
    𝒫 # problem
    d # search depth
    Ulo # lower bound on value function at depth d, called as Ulo(s)
    Qhi # upper bound on action value function, called as Qhi(s, a)
end

# Depth-limited search from state `s` with pruning.
# Actions are tried in decreasing order of their upper bound `Qhi(s, a)`; the
# search stops as soon as an action's upper bound falls below the best value
# found so far. Returns a named tuple `(a, u)`; at `d ≤ 0` the action is
# `nothing` and the value is `Ulo(s)`.
function branch_and_bound(𝒫, s, d, Ulo, Qhi)
    d ≤ 0 && return (a=nothing, u=Ulo(s))
    # Value of a successor state, obtained by searching one level shallower.
    U′(s′) = branch_and_bound(𝒫, s′, d - 1, Ulo, Qhi).u
    best_a, best_u = nothing, -Inf
    ordered = sort(𝒫.𝒜, by=a -> Qhi(s, a), rev=true)
    for a in ordered
        # Every remaining action has an upper bound no larger than this one,
        # so none of them can beat the incumbent.
        Qhi(s, a) < best_u && return (a=best_a, u=best_u)
        u = lookahead(𝒫, U′, s, a)
        if u > best_u
            best_a, best_u = a, u
        end
    end
    return (a=best_a, u=best_u)
end

# Choose an action for state `s` by running `branch_and_bound` with the
# planner's settings and keeping only the action.
function (π::BranchAndBound)(s)
    result = branch_and_bound(π.𝒫, s, π.d, π.Ulo, π.Qhi)
    return result.a
end

3.5 稀疏采样

# Online planner that estimates action values from a fixed number of sampled
# transitions per action at every level of the search.
struct SparseSampling
    𝒫 # problem
    d # search depth
    m # number of samples drawn per action
    U # value function applied at depth d (the leaves), called as U(s)
end

# Depth-limited search from state `s` using `m` sampled transitions per action.
# Each action's value is the average over the samples of `r + γ * u′`, where
# `u′` comes from a recursive search at depth `d - 1`. Returns a named tuple
# `(a, u)`; at `d ≤ 0` the action is `nothing` and the value is `U(s)`.
function sparse_sampling(𝒫, s, d, m, U)
    d ≤ 0 && return (a=nothing, u=U(s))
    best_a, best_u = nothing, -Inf
    for a in 𝒫.𝒜
        estimate = 0.0
        for _ in 1:m
            s′, r = randstep(𝒫, s, a)
            u′ = sparse_sampling(𝒫, s′, d - 1, m, U).u
            estimate += (r + 𝒫.γ * u′) / m
        end
        if estimate > best_u
            best_a, best_u = a, estimate
        end
    end
    return (a=best_a, u=best_u)
end

# Choose an action for state `s` by running `sparse_sampling` with the
# planner's settings and keeping only the action.
function (π::SparseSampling)(s)
    result = sparse_sampling(π.𝒫, s, π.d, π.m, π.U)
    return result.a
end

3.6 蒙特卡罗树搜索

# Online planner based on Monte Carlo tree search.
struct MonteCarloTreeSearch
    𝒫 # problem
    N # visit counts
    Q # action value estimates, looked up with (s, a) tuples as keys
    d # depth
    m # number of simulations run per decision
    c # exploration constant
    U # value function estimate
end

# Choose an action for state `s`: run `m` simulations from `s` via
# `simulate!(π, s)`, then return the action in `𝒫.𝒜` with the largest
# estimate `Q[(s, a)]`.
function (π::MonteCarloTreeSearch)(s)
    for _ in 1:π.m
        simulate!(π, s)
    end
    action_value(a) = π.Q[(s, a)]
    return argmax(action_value, π.𝒫.𝒜)
end

3.7 启发式搜索

# Online planner that refines an upper-bound value estimate by greedy simulation.
struct HeuristicSearch
    𝒫 # problem
    Uhi # upper bound on value function, called as Uhi(s)
    d # depth (number of steps per simulation)
    m # number of simulations run per decision
end

# Run one greedy simulation of `π.d` steps starting from `s`, updating `U` in
# place. At each step the greedy action and value are computed for the current
# state, the value is written to `U[s]`, and the next state is sampled from
# `𝒫.T(s, a)`. Returns `nothing`.
function simulate!(π::HeuristicSearch, U, s)
    for _ in 1:π.d
        a, u = greedy(π.𝒫, U, s)
        U[s] = u
        s = rand(π.𝒫.T(s, a))
    end
    return nothing
end

# Choose an action for state `s`: initialise the value estimate of every state
# in `𝒫.𝒮` from the upper bound `Uhi`, refine it with `m` simulations from `s`,
# and return the greedy action at `s`.
function (π::HeuristicSearch)(s)
    U = [π.Uhi(s′) for s′ in π.𝒫.𝒮]
    for _ in 1:π.m
        simulate!(π, U, s)
    end
    decision = greedy(π.𝒫, U, s)
    return decision.a
end

3.8 标签启发式搜索

# Online planner: heuristic search that tracks a set of states labeled as solved.
struct LabeledHeuristicSearch
    𝒫 # problem
    Uhi # upper bound on value function, called as Uhi(s)
    d # depth
    δ # gap threshold
end

# Choose an action for state `s` with labeled heuristic search.
# The value estimate of every state in `π.𝒫.𝒮` is initialised from the upper
# bound `Uhi`; simulations are then repeated until `s` has been added to the
# `solved` set, and the greedy action at `s` is returned.
# NOTE(review): the four-argument `simulate!(π, U, solved, s)` is not defined
# in the visible part of this file; it must put `s` into `solved` for the
# loop to terminate.
function (π::LabeledHeuristicSearch)(s)
    # Read the state space from the planner's own problem; a bare `𝒫` is not
    # bound inside this method.
    U, solved = [π.Uhi(s′) for s′ in π.𝒫.𝒮], Set()
    while s ∉ solved
        simulate!(π, U, solved, s)
    end
    return greedy(π.𝒫, U, s).a
end

3.9 开环规划/model predictive control

开环规划可提供最佳闭环规划的满意近似，同时由于不必对未来信息的获取进行推理，计算效率更高。过程可表示为$$\max_{a_{1:d}} \mathcal{U}(a_{1:d}),$$其中$\mathcal{U}(a_{1:d})$是执行动作序列$a_{1:d}$时的期望回报。

确定性模型预测控制
$$\begin{align*} & \max_{a_{1:d}, s_{2:d}} \qquad \sum_{t = 1}^{d} \gamma^{t} R(s_{t}, a_{t}) \\ & {\rm s.t.} \qquad \qquad s_{t+1} = T(s_{t}, a_{t}), \ t \in 1:d-1. \end{align*}$$
鲁棒模型预测控制
$$\begin{align*} & \max_{a_{1:d}} \qquad \min_{s_{2:d}} \sum_{t = 1}^{d} \gamma^{t} R(s_{t}, a_{t}) \\ & {\rm s.t.} \qquad \quad s_{t+1} = T(s_{t}, a_{t}), \ t \in 1:d-1. \end{align*}$$
多预测模型预测控制
$$\begin{align*} & \max_{a_{1:d}^{(1:m)}, s_{2:d}^{(1:m)}} \qquad \frac{1}{m} \sum_{i=1}^{m}\sum_{k = 1}^{d} \gamma^{k} R(s_{k}^{(i)}, a_{k}^{(i)}) \\ & {\rm s.t.} \qquad \qquad s_{k+1}^{(i)} = T_{i}(s_{k}^{(i)}, a_{k}^{(i)}), \ k \in 1:d-1, i \in 1:m, \\ & \quad \qquad \qquad \ \ a_{1}^{(i)} = a_{1}^{(j)}, \qquad \qquad i, j \in 1:m. \end{align*}$$

【读书笔记】Algorithms for Decision Making（6）

三、序列问题（2）

2. 近似值函数

2.1 参数化表示

2.2 最邻近方法

2.3 核光滑方法

2.4 线性插值

2.5 单纯形插值

2.6 线性回归与神经网络回归

3. 在线规划

3.1 滚动时域规划（Receding Horizon Planning）

3.2 Lookahead with Rollouts

3.3 正向搜索（Forward Search）

3.4 分支定界方法（Branch and Bound）

3.5 稀疏采样

3.6 蒙特卡罗树搜索

3.7 启发式搜索

3.8 标签启发式搜索

3.9 开环规划/model predictive control

热门文章

最新文章

相关电子书

探索云世界

热门

云计算

大数据

云原生

人工智能

数据库

开发与运维

活动广场

任务中心

训练营

直播

乘风者计划

下载

镜像站

技术资料

【读书笔记】Algorithms for Decision Making（6）

三、序列问题（2）

2. 近似值函数

2.1 参数化表示

2.2 最邻近方法

2.3 核光滑方法

2.4 线性插值

2.5 单纯形插值

2.6 线性回归与神经网络回归

3. 在线规划

3.1 滚动时域规划（Receding Horizon Planning）

3.2 Lookahead with Rollouts

3.3 正向搜索（Forward Search）

3.4 分支定界方法（Branch and Bound）

3.5 稀疏采样

3.6 蒙特卡罗树搜索

3.7 启发式搜索

3.8 标签启发式搜索

3.9 开环规划/model predictive control

热门文章

最新文章

相关电子书