5. Multiagent Systems (1)
We now extend the core concepts of single-agent decision making to multiagent systems, in which the other agents can be modeled as potential allies or adversaries and our behavior is adjusted accordingly over time.
1. Multiagent Reasoning
Multiagent reasoning is fundamentally a topic in game theory.¹
1.1 Simple Games
A simple game consists of the following components:
struct SimpleGame
γ # discount factor
ℐ # agents
𝒜 # joint action space
R # joint reward function
end
Specifically, each agent $i \in \mathcal{I}$ selects an action $a^{i} \in \mathcal{A}^{i}$ to maximize its own reward $r^{i} = R^{i}(\bm{a})$.
- The joint action space $\mathcal{A} = \mathcal{A}^{1} \times \cdots \times \mathcal{A}^{k}$ contains all possible combinations of the agents' actions.
- A joint action is $\bm{a} = \left(a^{1}, \cdots, a^{k}\right) \in \mathcal{A}$.
- The joint reward function $R(\bm{a}) = \left(R^{1}(\bm{a}), \cdots, R^{k}(\bm{a})\right)$ gives each agent's reward for a joint action.
A joint policy $\bm{\pi}$ specifies a probability distribution over the actions the agents take. In game theory, a deterministic policy is called a pure strategy and a stochastic policy is called a mixed strategy. From the perspective of agent $i$, the utility of a joint policy $\bm{\pi}$ is $$\mathcal{U}^{i}(\bm{\pi}) = \sum_{\bm{a} \in \mathcal{A}} R^{i}(\bm{a}) \prod_{j \in \mathcal{I}} \pi^{j}(a^{j}).$$
struct SimpleGamePolicy
p # dictionary mapping actions to probabilities
function SimpleGamePolicy(p::Base.Generator)
return SimpleGamePolicy(Dict(p))
end
function SimpleGamePolicy(p::Dict)
vs = collect(values(p))
vs ./= sum(vs)
return new(Dict(k => v for (k,v) in zip(keys(p), vs)))
end
SimpleGamePolicy(ai) = new(Dict(ai => 1.0))
end
(πi::SimpleGamePolicy)(ai) = get(πi.p, ai, 0.0)
function (πi::SimpleGamePolicy)()
D = SetCategorical(collect(keys(πi.p)), collect(values(πi.p)))
return rand(D)
end
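The sampling call above relies on `SetCategorical`, which is not defined in this excerpt. A minimal stand-in, assuming it simply samples elements in proportion to the given weights, could look like this:
using Distributions: Categorical
# Minimal stand-in: a categorical distribution over an arbitrary set of elements.
struct SetCategorical{S}
    elements::Vector{S}
    distr::Categorical
    function SetCategorical(elements::AbstractVector{S}, weights::AbstractVector{Float64}) where S
        return new{S}(collect(elements), Categorical(weights ./ sum(weights)))
    end
end
Base.rand(D::SetCategorical) = D.elements[rand(D.distr)]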
joint(X) = vec(collect(product(X...)))  # all joint actions: Cartesian product of the per-agent action spaces (assumes `product`, e.g. Base.Iterators.product, is in scope)
joint(π, πi, i) = [i == j ? πi : πj for (j, πj) in enumerate(π)]  # joint policy with agent i's policy replaced by πi
function utility(𝒫::SimpleGame, π, i)
𝒜, R = 𝒫.𝒜, 𝒫.R
p(a) = prod(πj(aj) for (πj, aj) in zip(π, a))
return sum(R(a)[i]*p(a) for a in joint(𝒜))
end
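As a quick sanity check, the following sketch builds a hypothetical two-agent prisoner's dilemma (the payoff values and the name 𝒫 are illustrative, not from the text) and evaluates the utility of a joint policy:
# Hypothetical prisoner's dilemma: two agents that each either cooperate or defect.
payoffs = Dict(
    (:cooperate, :cooperate) => [-1.0, -1.0],
    (:cooperate, :defect)    => [-4.0,  0.0],
    (:defect,    :cooperate) => [ 0.0, -4.0],
    (:defect,    :defect)    => [-3.0, -3.0])
𝒜pd = [[:cooperate, :defect], [:cooperate, :defect]]  # per-agent action spaces
𝒫 = SimpleGame(0.9, [1, 2], 𝒜pd, a -> payoffs[Tuple(a)])
# Both agents cooperate with probability 0.5; agent 1's expected reward is -2.0.
π = [SimpleGamePolicy(Dict(:cooperate => 0.5, :defect => 0.5)) for _ in 1:2]
utility(𝒫, π, 1)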
1.2 Response Models
We first consider how to model the response of a single agent $i$ given fixed policies for the other agents.
function best_response(𝒫::SimpleGame, π, i)
U(ai) = utility(𝒫, joint(π, SimpleGamePolicy(ai), i), i)
ai = argmax(U, 𝒫.𝒜[i])
return SimpleGamePolicy(ai)
end
function softmax_response(𝒫::SimpleGame, π, i, λ)
𝒜i = 𝒫.𝒜[i]
U(ai) = utility(𝒫, joint(π, SimpleGamePolicy(ai), i), i)
return SimpleGamePolicy(ai => exp(λ*U(ai)) for ai in 𝒜i)
end
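Continuing the hypothetical prisoner's dilemma 𝒫 from above, agent 1's responses to a fixed opponent policy can be computed as follows:
# Agent 2 mostly cooperates; for agent 1, defecting has the higher expected reward.
π2 = SimpleGamePolicy(Dict(:cooperate => 0.8, :defect => 0.2))
π  = [SimpleGamePolicy(:cooperate), π2]  # agent 1's entry is a placeholder; best_response replaces it
best_response(𝒫, π, 1)                   # deterministic policy that always defects
softmax_response(𝒫, π, 1, 1.0)           # mixed policy weighted toward defecting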
1.3 Nash Equilibrium
A Nash equilibrium is a joint policy from which no agent can improve its expected utility by unilaterally deviating. Finding a Nash equilibrium can be formulated as the following optimization problem:
$$\begin{align*} \min_{\pi, \mathcal{U}} \quad & \sum_{i} \left(\mathcal{U}^{i} - \mathcal{U}^{i}(\pi)\right) \\ {\rm s.t.} \quad & \mathcal{U}^{i} \geq \mathcal{U}^{i} (a^{i}, \pi^{- i}), \ \forall i, a^{i} \\ & \sum_{a^{i}} \pi^{i} (a^{i}) = 1,\ \forall i \\ & \pi^{i} (a^{i}) \geq 0, \ \forall i, a^{i} \end{align*}$$
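The solver below models this program with the JuMP package and the Ipopt optimizer, neither of which is imported in the excerpt; they need to be loaded first:
using JuMP   # optimization modeling (provides @variable, @NLobjective, @NLconstraint, @constraint)
using Ipopt  # nonlinear interior-point solver used as the optimizer backend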
struct NashEquilibrium end
function tensorform(𝒫::SimpleGame)
ℐ, 𝒜, R = 𝒫.ℐ, 𝒫.𝒜, 𝒫.R
ℐ′ = eachindex(ℐ)
𝒜′ = [eachindex(𝒜[i]) for i in ℐ]
R′ = [R(a) for a in joint(𝒜)]
return ℐ′, 𝒜′, R′
end
function solve(M::NashEquilibrium, 𝒫::SimpleGame)
ℐ, 𝒜, R = tensorform(𝒫)
model = Model(Ipopt.Optimizer)
@variable(model, U[ℐ])
@variable(model, π[i=ℐ, 𝒜[i]] ≥ 0)
@NLobjective(model, Min,
sum(U[i] - sum(prod(π[j,a[j]] for j in ℐ) * R[y][i]
for (y,a) in enumerate(joint(𝒜))) for i in ℐ))
@NLconstraint(model, [i=ℐ, ai=𝒜[i]],
U[i] ≥ sum(
prod(j==i ? (a[j]==ai ? 1.0 : 0.0) : π[j,a[j]] for j in ℐ)
* R[y][i] for (y,a) in enumerate(joint(𝒜))))
@constraint(model, [i=ℐ], sum(π[i,ai] for ai in 𝒜[i]) == 1)
optimize!(model)
πi′(i) = SimpleGamePolicy(𝒫.𝒜[i][ai] => value(π[i,ai]) for ai in 𝒜[i])
return [πi′(i) for i in ℐ]
end
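On the hypothetical prisoner's dilemma from earlier, the solver should recover mutual defection, the game's unique Nash equilibrium:
πNE = solve(NashEquilibrium(), 𝒫)
πNE[1](:defect)   # expected to be ≈ 1.0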
1.3.1 Correlated Equilibrium
struct CorrelatedEquilibrium end
function solve(M::CorrelatedEquilibrium, 𝒫::SimpleGame)
ℐ, 𝒜, R = 𝒫.ℐ, 𝒫.𝒜, 𝒫.R
model = Model(Ipopt.Optimizer)
@variable(model, π[joint(𝒜)] ≥ 0)
@objective(model, Max, sum(sum(π[a]*R(a) for a in joint(𝒜))))
@constraint(model, [i=ℐ, ai=𝒜[i], ai′=𝒜[i]],
sum(R(a)[i]*π[a] for a in joint(𝒜) if a[i]==ai)
≥ sum(R(joint(a,ai′,i))[i]*π[a] for a in joint(𝒜) if a[i]==ai))
@constraint(model, sum(π) == 1)
optimize!(model)
return JointCorrelatedPolicy(a => value(π[a]) for a in joint(𝒜))
end
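A correlated equilibrium is a single distribution over joint actions from which no agent benefits by deviating from its recommended action; the linear program above selects the one maximizing the sum of expected rewards. The `JointCorrelatedPolicy` returned by the solver is not defined in this excerpt; a minimal sketch consistent with `SimpleGamePolicy` could be:
struct JointCorrelatedPolicy
    p # dictionary mapping joint actions to probabilities
    JointCorrelatedPolicy(p::Base.Generator) = new(Dict(p))
end
(π::JointCorrelatedPolicy)(a) = get(π.p, a, 0.0)
function (π::JointCorrelatedPolicy)()
    D = SetCategorical(collect(keys(π.p)), collect(values(π.p)))
    return rand(D)
end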
1.4 Iterated Best Response
Because computing a Nash equilibrium can be computationally expensive, an alternative is to iteratively apply best responses over a series of repeated games.
struct IteratedBestResponse
k_max # number of iterations
π # initial policy
end
function IteratedBestResponse(𝒫::SimpleGame, k_max)
π = [SimpleGamePolicy(ai => 1.0 for ai in 𝒜i) for 𝒜i in 𝒫.𝒜]
return IteratedBestResponse(k_max, π)
end
function solve(M::IteratedBestResponse, 𝒫)
π = M.π
for k in 1:M.k_max
π = [best_response(𝒫, π, i) for i in 𝒫.ℐ]
end
return π
end
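A brief usage sketch on the hypothetical game; note that iterated best response is not guaranteed to converge in general and can cycle (for example, in rock-paper-scissors):
π = solve(IteratedBestResponse(𝒫, 10), 𝒫)   # 10 rounds of simultaneous best responses
π[1](:defect)                                # 1.0 for the prisoner's dilemma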
1.4.1 Hierarchical Softmax
struct HierarchicalSoftmax
λ # precision parameter
k # level
π # initial policy
end
function HierarchicalSoftmax(𝒫::SimpleGame, λ, k)
π = [SimpleGamePolicy(ai => 1.0 for ai in 𝒜i) for 𝒜i in 𝒫.𝒜]
return HierarchicalSoftmax(λ, k, π)
end
function solve(M::HierarchicalSoftmax, 𝒫)
π = M.π
for k in 1:M.k
π = [softmax_response(𝒫, π, i, M.λ) for i in 𝒫.ℐ]
end
return π
end
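Hierarchical softmax captures level-k reasoning: starting from uniform random (level 0) policies, each level applies a softmax response with precision λ to the policies of the previous level. A usage sketch on the hypothetical game:
π = solve(HierarchicalSoftmax(𝒫, 2.0, 4), 𝒫)   # level 4 with precision λ = 2
π[1](:defect)                                   # probability of defecting; approaches 1 as λ grows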
1.5 Fictitious Play
Another way to compute policies for the different agents is to have them interact with one another in simulation and learn how to best respond.
mutable struct FictitiousPlay
𝒫 # simple game
i # agent index
N # array of action count dictionaries
πi # current policy
end
function FictitiousPlay(𝒫::SimpleGame, i)
N = [Dict(aj => 1 for aj in 𝒫.𝒜[j]) for j in 𝒫.ℐ]
πi = SimpleGamePolicy(ai => 1.0 for ai in 𝒫.𝒜[i])
return FictitiousPlay(𝒫, i, N, πi)
end
(πi::FictitiousPlay)() = πi.πi()
(πi::FictitiousPlay)(ai) = πi.πi(ai)
function update!(πi::FictitiousPlay, a)
N, 𝒫, ℐ, i = πi.N, πi.𝒫, πi.𝒫.ℐ, πi.i
for (j, aj) in enumerate(a)
N[j][aj] += 1
end
p(j) = SimpleGamePolicy(aj => u/sum(values(N[j])) for (aj, u) in N[j])
π = [p(j) for j in ℐ]
πi.πi = best_response(𝒫, π, i)
end
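In fictitious play, each agent keeps counts of the actions the other agents have played, treats the resulting empirical frequencies as their mixed strategies, and best-responds to them. A minimal simulation loop (the helper name is illustrative, not from the text):
function simulate_fictitious_play(𝒫::SimpleGame, k_max)
    agents = [FictitiousPlay(𝒫, i) for i in 𝒫.ℐ]
    for k in 1:k_max
        a = [agent() for agent in agents]   # each agent samples an action from its current policy
        for agent in agents
            update!(agent, a)               # update counts and recompute the best response
        end
    end
    return agents
end
agents = simulate_fictitious_play(𝒫, 100)
agents[1](:defect)   # probability agent 1 assigns to defecting after 100 rounds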
1.6 Gradient Ascent
mutable struct GradientAscent
𝒫 # simple game
i # agent index
t # time step
πi # current policy
end
function GradientAscent(𝒫::SimpleGame, i)
uniform() = SimpleGamePolicy(ai => 1.0 for ai in 𝒫.𝒜[i])
return GradientAscent(𝒫, i, 1, uniform())
end
(πi::GradientAscent)() = πi.πi()
(πi::GradientAscent)(ai) = πi.πi(ai)
function update!(πi::GradientAscent, a)
𝒫, ℐ, 𝒜i, i, t = πi.𝒫, πi.𝒫.ℐ, πi.𝒫.𝒜[πi.i], πi.i, πi.t
jointπ(ai) = [SimpleGamePolicy(j == i ? ai : a[j]) for j in ℐ]
r = [utility(𝒫, jointπ(ai), i) for ai in 𝒜i]
π′ = [πi.πi(ai) for ai in 𝒜i]
π = project_to_simplex(π′ + r / sqrt(t))
πi.t = t + 1
πi.πi = SimpleGamePolicy(ai => p for (ai, p) in zip(𝒜i, π))
end
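Each update performs a step of projected gradient ascent: the utilities of agent i's actions against the opponents' observed actions serve as the gradient, the step size decays as $1/\sqrt{t}$, and the result is projected back onto the probability simplex. `project_to_simplex` is not defined in this excerpt; a standard Euclidean projection onto the simplex (an assumed implementation) is:
# Euclidean projection of a vector y onto the probability simplex.
function project_to_simplex(y)
    u = sort(copy(y), rev=true)
    k = maximum(j for j in eachindex(u) if u[j] + (1 - sum(u[1:j]))/j > 0)
    δ = (1 - sum(u[1:k])) / k
    return [max(yi + δ, 0.0) for yi in y]
end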
- ¹ Foundational texts on game theory include: [1] D. Fudenberg and J. Tirole, Game Theory. MIT Press, 1991. [2] R. B. Myerson, Game Theory: Analysis of Conflict. Harvard University Press, 1997. [3] Y. Shoham and K. Leyton-Brown, Multiagent Systems: Algorithmic, Game-Theoretic, and Logical Foundations. Cambridge University Press, 2009.