【读书笔记】Algorithms for Decision Making(14)

简介: 本部分将简单游戏扩展到具有多个状态的连续上下文。马尔可夫博弈可以看作是多个具有自己奖励函数的智能体的马尔可夫决策过程。

五、多智能体系统(3)

本部分将简单游戏扩展到具有多个状态的连续上下文。马尔可夫博弈可以看作是多个具有自己奖励函数的智能体的马尔可夫决策过程。


3. 状态不确定

3.1 Partially Observable Markov Games

POMG可以看作是MG到部分可观测性的扩展,也可以看作是POMDP到多个代理的扩展。

struct POMG
    γ # discount factor
    ℐ # agents
    𝒮 # state space
    𝒜 # joint action space
    𝒪 # joint observation space
    T # transition function
    O # joint observation function
    R # joint reward function
end

3.2 策略进化

3.2.1 基于树的条件规划的策略

function lookahead(𝒫::POMG, U, s, a)
    𝒮, 𝒪, T, O, R, γ = 𝒫.𝒮, joint(𝒫.𝒪), 𝒫.T, 𝒫.O, 𝒫.R, 𝒫.γ
    u′ = sum(T(s,a,s′)*sum(O(a,s′,o)*U(o,s′) for o in 𝒪) for s′ in 𝒮)
    return R(s,a) + γ*u′
end

function evaluate_plan(𝒫::POMG, π, s)
    a = Tuple(πi() for πi in π)
    U(o,s′) = evaluate_plan(𝒫, [πi(oi) for (πi, oi) in zip(π,o)], s′)
    return isempty(first(π).subplans) ? 𝒫.R(s,a) : lookahead(𝒫, U, s, a)
end

function utility(𝒫::POMG, b, π)
    u = [evaluate_plan(𝒫, π, s) for s in 𝒫.𝒮]
    return sum(bs * us for (bs, us) in zip(b, u))
end

3.2.2 基于图的控制器的策略

在这里插入图片描述

3.3 Nash 均衡

struct POMGNashEquilibrium
    b # initial belief
    d # depth of conditional plans
end

function create_conditional_plans(𝒫, d)
    ℐ, 𝒜, 𝒪 = 𝒫.ℐ, 𝒫.𝒜, 𝒫.𝒪
    Π = [[ConditionalPlan(ai) for ai in 𝒜[i]] for i in ℐ]
    for t in 1:d
        Π = expand_conditional_plans(𝒫, Π)
    end
    return Π
end

function expand_conditional_plans(𝒫, Π)
    ℐ, 𝒜, 𝒪 = 𝒫.ℐ, 𝒫.𝒜, 𝒫.𝒪
    return [[ConditionalPlan(ai, Dict(oi => πi for oi in 𝒪[i]))
    for πi in Π[i] for ai in 𝒜[i]] for i in ℐ]
end

function solve(M::POMGNashEquilibrium, 𝒫::POMG)
    ℐ, γ, b, d = 𝒫.ℐ, 𝒫.γ, M.b, M.d
    Π = create_conditional_plans(𝒫, d)
    U = Dict(π => utility(𝒫, b, π) for π in joint(Π))
    𝒢 = SimpleGame(γ, ℐ, Π, π -> U[π])
    π = solve(NashEquilibrium(), 𝒢)
    return Tuple(argmax(πi.p) for πi in π)
end

3.4 动态规划

struct POMGDynamicProgramming
    b # initial belief
    d # depth of conditional plans
end

function solve(M::POMGDynamicProgramming, 𝒫::POMG)
    ℐ, 𝒮, 𝒜, R, γ, b, d = 𝒫.ℐ, 𝒫.𝒮, 𝒫.𝒜, 𝒫.R, 𝒫.γ, M.b, M.d
    Π = [[ConditionalPlan(ai) for ai in 𝒜[i]] for i in ℐ]
    for t in 1:d
        Π = expand_conditional_plans(𝒫, Π)
        prune_dominated!(Π, 𝒫)
    end
    𝒢 = SimpleGame(γ, ℐ, Π, π -> utility(𝒫, b, π))
    π = solve(NashEquilibrium(), 𝒢)
    return Tuple(argmax(πi.p) for πi in π)
end

function prune_dominated!(Π, 𝒫::POMG)
    done = false
    while !done
        done = true
        for i in shuffle(𝒫.ℐ)
            for πi in shuffle(Π[i])
                if length(Π[i]) > 1 && is_dominated(𝒫, Π, i, πi)
                    filter!(πi′ -> πi′ ≠ πi, Π[i])
                    done = false
                    break
                end
            end
        end
    end
end

function is_dominated(𝒫::POMG, Π, i, πi)
    ℐ, 𝒮 = 𝒫.ℐ, 𝒫.𝒮
    jointΠnoti = joint([Π[j] for j in ℐ if j ≠ i])
    π(πi′, πnoti) = [j==i ? πi′ : πnoti[j>i ? j-1 : j] for j in ℐ]
    Ui = Dict((πi′, πnoti, s) => evaluate_plan(𝒫, π(πi′, πnoti), s)[i]
        for πi′ in Π[i], πnoti in jointΠnoti, s in 𝒮)
    model = Model(Ipopt.Optimizer)
    @variable(model, δ)
    @variable(model, b[jointΠnoti, 𝒮] ≥ 0)        
    @objective(model, Max, δ)
    @constraint(model, [πi′=Π[i]],
        sum(b[πnoti, s] * (Ui[πi′, πnoti, s] - Ui[πi, πnoti, s])
        for πnoti in jointΠnoti for s in 𝒮) ≥ δ)
    @constraint(model, sum(b) == 1)
    optimize!(model)
    return value(δ) ≥ 0
end

4. Decentralized Partially Observable Markov Decision Processes

Dec-POMDP是所有智能体都共享相同目标的POMG。

struct DecPOMDP
    γ # discount factor
    ℐ # agents
    𝒮 # state space
    𝒜 # joint action space
    𝒪 # joint observation space
    T # transition function
    O # joint observation function
    R # reward function
end

4.1 Subclass

在这里插入图片描述

4.2 算法

4.2.1 动态规划

struct DecPOMDPDynamicProgramming
    b # initial belief
    d # depth of conditional plans
end

function solve(M::DecPOMDPDynamicProgramming, 𝒫::DecPOMDP)
    ℐ, 𝒮, 𝒜, 𝒪, T, O, R, γ = 𝒫.ℐ, 𝒫.𝒮, 𝒫.𝒜, 𝒫.𝒪, 𝒫.T, 𝒫.O, 𝒫.R, 𝒫.γ
    R′(s, a) = [R(s, a) for i in ℐ]
    𝒫′ = POMG(γ, ℐ, 𝒮, 𝒜, 𝒪, T, O, R′)
    M′ = POMGDynamicProgramming(M.b, M.d)
    return solve(M′, 𝒫′)
end

4.2.2 迭代最佳响应

struct DecPOMDPIteratedBestResponse
    b # initial belief
    d # depth of conditional plans
    k_max # number of iterations
end

function solve(M::DecPOMDPIteratedBestResponse, 𝒫::DecPOMDP)
    ℐ, 𝒮, 𝒜, 𝒪, T, O, R, γ = 𝒫.ℐ, 𝒫.𝒮, 𝒫.𝒜, 𝒫.𝒪, 𝒫.T, 𝒫.O, 𝒫.R, 𝒫.γ
    b, d, k_max = M.b, M.d, M.k_max
    R′(s, a) = [R(s, a) for i in ℐ]
    𝒫′ = POMG(γ, ℐ, 𝒮, 𝒜, 𝒪, T, O, R′)
    Π = create_conditional_plans(𝒫, d)
    π = [rand(Π[i]) for i in ℐ]
    for k in 1:k_max
        for i in shuffle(ℐ)
            π′(πi) = Tuple(j == i ? πi : π[j] for j in ℐ)
            Ui(πi) = utility(𝒫′, b, π′(πi))[i]
            π[i] = argmax(Ui, Π[i])
        end
    end
    return Tuple(π)
end

4.2.3 Heuristic Search

struct DecPOMDPHeuristicSearch
    b # initial belief
    d # depth of conditional plans
    π_max # number of policies
end

function solve(M::DecPOMDPHeuristicSearch, 𝒫::DecPOMDP)
    ℐ, 𝒮, 𝒜, 𝒪, T, O, R, γ = 𝒫.ℐ, 𝒫.𝒮, 𝒫.𝒜, 𝒫.𝒪, 𝒫.T, 𝒫.O, 𝒫.R, 𝒫.γ
    b, d, π_max = M.b, M.d, M.π_max
    R′(s, a) = [R(s, a) for i in ℐ]
    𝒫′ = POMG(γ, ℐ, 𝒮, 𝒜, 𝒪, T, O, R′)
    Π = [[ConditionalPlan(ai) for ai in 𝒜[i]] for i in ℐ]
    for t in 1:d
        allΠ = expand_conditional_plans(𝒫, Π)
        Π = [[] for i in ℐ]
        for z in 1:π_max
            b′ = explore(M, 𝒫, t)
            π = argmax(π -> first(utility(𝒫′, b′, π)), joint(allΠ))
            for i in ℐ
                push!(Π[i], π[i])
                filter!(πi -> πi != π[i], allΠ[i])
            end
        end
    end
    return argmax(π -> first(utility(𝒫′, b, π)), joint(Π))
end

function explore(M::DecPOMDPHeuristicSearch, 𝒫::DecPOMDP, t)
    ℐ, 𝒮, 𝒜, 𝒪, T, O, R, γ = 𝒫.ℐ, 𝒫.𝒮, 𝒫.𝒜, 𝒫.𝒪, 𝒫.T, 𝒫.O, 𝒫.R, 𝒫.γ
    b = copy(M.b)
    b′ = similar(b)
    s = rand(SetCategorical(𝒮, b))
    for τ in 1:t
        a = Tuple(rand(𝒜i) for 𝒜i in 𝒜)
        s′ = rand(SetCategorical(𝒮, [T(s,a,s′) for s′ in 𝒮]))
        o = rand(SetCategorical(joint(𝒪), [O(a,s′,o) for o in joint(𝒪)]))
        for (i′, s′) in enumerate(𝒮)
            po = O(a, s′, o)
            b′[i′] = po*sum(T(s,a,s′)*b[i] for (i,s) in enumerate(𝒮))
        end
        normalize!(b′, 1)
        b, s = b′, s′
    end
    return b′
end

4.2.4 非线性规划

struct DecPOMDPNonlinearProgramming
    b # initial belief
    ℓ # number of nodes for each agent
end

function tensorform(𝒫::DecPOMDP)
    ℐ, 𝒮, 𝒜, 𝒪, R, T, O = 𝒫.ℐ, 𝒫.𝒮, 𝒫.𝒜, 𝒫.𝒪, 𝒫.R, 𝒫.T, 𝒫.O
    ℐ′ = eachindex(ℐ)
    𝒮′ = eachindex(𝒮)
    𝒜′ = [eachindex(𝒜i) for 𝒜i in 𝒜]
    𝒪′ = [eachindex(𝒪i) for 𝒪i in 𝒪]
    R′ = [R(s,a) for s in 𝒮, a in joint(𝒜)]
    T′ = [T(s,a,s′) for s in 𝒮, a in joint(𝒜), s′ in 𝒮]
    O′ = [O(a,s′,o) for a in joint(𝒜), s′ in 𝒮, o in joint(𝒪)]
    return ℐ′, 𝒮′, 𝒜′, 𝒪′, R′, T′, O′
end

function solve(M::DecPOMDPNonlinearProgramming, 𝒫::DecPOMDP)
    𝒫, γ, b = 𝒫, 𝒫.γ, M.b
    ℐ, 𝒮, 𝒜, 𝒪, R, T, O = tensorform(𝒫)
    X = [collect(1:M.ℓ) for i in ℐ]
    jointX, joint𝒜, joint𝒪 = joint(X), joint(𝒜), joint(𝒪)
    x1 = jointX[1]
    model = Model(Ipopt.Optimizer)
    @variable(model, U[jointX,𝒮])
    @variable(model, ψ[i=ℐ,X[i],𝒜[i]] ≥ 0)
    @variable(model, η[i=ℐ,X[i],𝒜[i],𝒪[i],X[i]] ≥ 0)
    @objective(model, Max, b⋅U[x1,:])
    @NLconstraint(model, [x=jointX,s=𝒮],
        U[x,s] == (sum(prod(ψ[i,x[i],a[i]] for i in ℐ)
            *(R[s,y] + γ*sum(T[s,y,s′]*sum(O[y,s′,z]
                *sum(prod(η[i,x[i],a[i],o[i],x′[i]] for i in ℐ)
                    *U[x′,s′] for x′ in jointX)
                for (z, o) in enumerate(joint𝒪)) for s′ in 𝒮))
            for (y, a) in enumerate(joint𝒜))))
    @constraint(model, [i=ℐ,xi=X[i]], sum(ψ[i,xi,ai] for ai in 𝒜[i]) == 1)
    @constraint(model, [i=ℐ,xi=X[i],ai=𝒜[i],oi=𝒪[i]], 
        sum(η[i,xi,ai,oi,xi′] for xi′ in X[i]) == 1)
    optimize!(model)
    ψ′, η′ = value.(ψ), value.(η)
    return [ControllerPolicy(𝒫, X[i],
        Dict((xi,𝒫.𝒜[i][ai]) => ψ′[i,xi,ai] for xi in X[i], ai in 𝒜[i]),
        Dict((xi,𝒫.𝒜[i][ai],𝒫.𝒪[i][oi],xi′) => η′[i,xi,ai,oi,xi′] 
            for xi in X[i], ai in 𝒜[i], oi in 𝒪[i], xi′ in X[i])) for i in ℐ]
end

总结

最后一部分可以看作是一个全文总结的案例,也可以看作是全文内容的升华(从单智能体到多智能体)。

相关文章
|
机器学习/深度学习 编解码 数据可视化
《Progressive Growing of GANs for Improved Quality, Stability, and Variation》-论文阅读笔记(三)
《Progressive Growing of GANs for Improved Quality, Stability, and Variation》-论文阅读笔记(三)
87 0
《Progressive Growing of GANs for Improved Quality, Stability, and Variation》-论文阅读笔记(三)
|
机器学习/深度学习
《Progressive Growing of GANs for Improved Quality, Stability, and Variation》-论文阅读笔记(一)
《Progressive Growing of GANs for Improved Quality, Stability, and Variation》-论文阅读笔记(一)
111 0
《Progressive Growing of GANs for Improved Quality, Stability, and Variation》-论文阅读笔记(一)
|
机器学习/深度学习 编解码
《Progressive Growing of GANs for Improved Quality, Stability, and Variation》-论文阅读笔记(二)
《Progressive Growing of GANs for Improved Quality, Stability, and Variation》-论文阅读笔记(二)
127 0
《Progressive Growing of GANs for Improved Quality, Stability, and Variation》-论文阅读笔记(二)
|
Python
【读书笔记】Algorithms for Decision Making(2)
理性决策需要对不确定性和目标进行推理。不确定性源于预测未来事件能力的实际及理论限制。为了实现其目标,一个强有力的决策系统必须考虑到当前世界状况和未来事件中的各种不确定性来源。
119 0
【读书笔记】Algorithms for Decision Making(2)
|
机器学习/深度学习 算法 流计算
【读书笔记】Algorithms for Decision Making(6)
对于较大状态空间的问题,计算精确解需要极大的内存量,因而考虑近似解的方法。常使用approximate dynamic programming的方法去寻求近似解,进而使用在线方法实现实时计算。
163 0
【读书笔记】Algorithms for Decision Making(6)
|
机器学习/深度学习 API
【读书笔记】Algorithms for Decision Making(8)
解决存在模型不确定性的此类问题是强化学习领域的主题,这是这部分的重点。解决模型不确定性的几个挑战:首先,智能体必须仔细平衡环境探索和利用通过经验获得的知识。第二,在做出重要决策后很长时间内,可能会收到奖励,因此必须将以后奖励的学分分配给以前的决策。第三,智能体必须从有限的经验中进行概括。
211 0
【读书笔记】Algorithms for Decision Making(8)
|
机器学习/深度学习 人工智能 算法
【读书笔记】Algorithms for Decision Making(1)
我自己的粗浅看法:机器学习要不是拟合逼近(经常提及的machine learning),要不就是决策过程(reinforcement learning),这本书主要讲述后者的前世今生。
331 0
【读书笔记】Algorithms for Decision Making(1)
|
算法 关系型数据库 数据建模
【读书笔记】Algorithms for Decision Making(4)
本部分讨论从数据学习或拟合模型参数的问题,进一步讨论了从数据中学习模型结构的方法,最后对决策理论进行了简单的概述。
100 0
【读书笔记】Algorithms for Decision Making(4)
|
机器学习/深度学习 算法 vr&ar
【读书笔记】Algorithms for Decision Making(9)
与基于模型的方法相比,无模型方法不需要构建转移函数和奖励函数的显性表示,而是直接作用于值函数建模。进一步地,考虑模拟学习来重建奖励函数。
【读书笔记】Algorithms for Decision Making(9)
|
算法 机器人
【读书笔记】Algorithms for Decision Making(10)
在这一部分将不确定性扩展到状态。具体讲,接收到的观测值与状态只有概率关系,而不是精确地观察状态。此类问题可以建模为部分可观察的马尔可夫决策过程(POMDP),但POMDP很难以最佳方式解决所有问题,因而需要引入更多的近似策略。
168 0