ทำความเข้าใจการหา Policy Gradient (เกรเดียนต์ของนโยบาย)

ฉันกำลังพยายามสร้างตัวอย่าง Policy Gradient แบบง่ายๆ ขึ้นมาใหม่ โดยอ้างอิงจากแหล่งข้อมูลต้นฉบับคือบล็อกของ Andrej Karpathy ในบทความนั้นคุณจะพบตัวอย่าง CartPole กับ Policy Gradient ที่ใช้รายการน้ำหนักและฟังก์ชันกระตุ้น Softmax ด้านล่างนี้คือตัวอย่าง Policy Gradient สำหรับ CartPole แบบง่ายมากที่ฉันสร้างขึ้นใหม่ ซึ่งทำงานได้อย่างสมบูรณ์แบบ

import gym
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
import copy

# Number of training episodes run by main().
NUM_EPISODES = 4000
# Step size applied to every weight update in Agent.update_with.
LEARNING_RATE = 0.000025
# Discount base used when accumulating future rewards in Agent.update_with.
GAMMA = 0.99


# noinspection PyMethodMayBeStatic
class Agent:
    """Linear softmax policy over two actions, updated once per episode.

    Action probabilities are ``softmax(state.dot(w))`` where ``w`` is a 5x2
    weight matrix (one column per action). ``poly`` is the feature expander
    the training loop applies to each observation before calling ``policy``.
    """

    def __init__(self):
        self.poly = PolynomialFeatures(1)
        # Weights start uniformly in [0, 1).
        self.w = np.random.rand(5, 2)

    def policy(self, state):
        """Return the action probabilities for ``state``.

        :param state: feature array whose last axis matches the rows of ``w``.
        :return: array of the same shape as ``state.dot(w)`` that sums to 1.
        """
        z = state.dot(self.w)
        # Softmax is unchanged by subtracting a constant from every logit;
        # subtracting the largest one keeps np.exp from overflowing to inf
        # (which would turn the result into nan).
        exp = np.exp(z - np.max(z))
        return exp / np.sum(exp)

    def __softmax_grad(self, softmax):
        """Return the Jacobian of the softmax given its output vector."""
        s = softmax.reshape(-1, 1)
        return np.diagflat(s) - np.dot(s, s.T)

    def grad(self, probs, action, state):
        """Return the gradient of ``log(probs[0, action])`` w.r.t. ``w``.

        :param probs: output of ``policy(state)``; indexed as ``probs[0, a]``.
        :param action: index of the action that was taken.
        :param state: the feature array ``probs`` was computed from.
        :return: array with the same shape as ``w``.
        """
        # Row ``action`` of the Jacobian is d probs[action] / d logits.
        dsoftmax = self.__softmax_grad(probs)[action, :]
        # d log(p) = dp / p
        dlog = dsoftmax / probs[0, action]
        return state.T.dot(dlog[None, :])

    def update_with(self, grads, rewards):
        """Update ``w`` from one finished episode.

        Each recorded gradient is scaled by the discounted sum of the rewards
        from its own time step onward, ``sum(GAMMA ** t * r_t)`` with ``t``
        counted from that step, and added to the weights.

        :param grads: per-step gradients as returned by ``grad``.
        :param rewards: per-step rewards, aligned with ``grads``.
        """
        # Build every return-to-go in a single backward pass:
        # G_i = r_i + GAMMA * G_{i+1}.
        returns = []
        running = 0.0
        for r in reversed(rewards):
            running = r + GAMMA * running
            returns.append(running)
        returns.reverse()

        for step_grad, future_return in zip(grads, returns):
            self.w += LEARNING_RATE * step_grad * future_return
            print("Grads update: " + str(np.sum(step_grad)))



def main(argv):
    """Train the softmax agent on CartPole-v0 and save the score curve.

    ``argv`` is accepted but not used. The per-episode scores are plotted
    and written to ``image1.png``.
    """
    env = gym.make('CartPole-v0')
    np.random.seed(1)

    agent = Agent()
    complete_scores = []

    for _episode in range(NUM_EPISODES):
        # Add a leading axis to the observation before expanding features.
        state = agent.poly.fit_transform(env.reset()[None, :])

        rewards = []
        grads = []
        done = False

        while not done:
            probs = agent.policy(state)
            n_actions = env.action_space.n
            action = np.random.choice(n_actions, p=probs[0])

            observation, reward, done, _info = env.step(action)

            # The gradient belongs to the state the action was chosen in.
            grads.append(agent.grad(probs, action, state))
            rewards.append(reward)

            state = agent.poly.fit_transform(observation.reshape(1, 4))

        agent.update_with(grads, rewards)
        # The episode score is the sum of its rewards.
        complete_scores.append(sum(rewards))

    env.close()
    episode_axis = np.arange(NUM_EPISODES)
    plt.plot(episode_axis, complete_scores)
    plt.savefig('image1.png')


if __name__ == '__main__':
    main(None)

คำถาม

ฉันพยายามทำตัวอย่างที่เกือบจะเหมือนกัน แต่ใช้ฟังก์ชันกระตุ้น Sigmoid แทน (เพื่อความเรียบง่าย) สิ่งเดียวที่ฉันต้องทำคือเปลี่ยนฟังก์ชันกระตุ้นในโมเดลจาก softmax เป็น sigmoid ซึ่งควรจะใช้งานได้อย่างแน่นอน (ตามคำอธิบายด้านล่าง) แต่โมเดล Policy Gradient ของฉันไม่ได้เรียนรู้อะไรเลยและยังคงให้ผลแบบสุ่ม มีข้อเสนอแนะอะไรไหม

import gym
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures

# Number of training episodes run by main().
NUM_EPISODES = 4000
# Step size applied to every weight update in Agent.update_with.
LEARNING_RATE = 0.000025
# Discount base used when accumulating future rewards in Agent.update_with.
GAMMA = 0.99


# noinspection PyMethodMayBeStatic
class Agent:
    """Linear sigmoid policy: one probability computed from a 5x1 weight vector.

    ``policy`` returns a single number that the training loop uses as the
    probability of action 1 (action 0 gets ``1 - p``).
    """

    def __init__(self):
        self.poly = PolynomialFeatures(1)
        # Weights start uniformly in [-0.5, 0.5).
        self.w = np.random.rand(5, 1) - 0.5

    # Our policy that maps state to action parameterized by w
    # noinspection PyShadowingNames
    def policy(self, state):
        """Return ``sigmoid(sum(state.dot(w)))`` as a scalar."""
        z = np.sum(state.dot(self.w))
        return self.sigmoid(z)

    def sigmoid(self, x):
        """Return the logistic function ``1 / (1 + exp(-x))``."""
        s = 1 / (1 + np.exp(-x))
        return s

    def sigmoid_grad(self, sig_x):
        """Return the sigmoid derivative given the sigmoid *output* ``sig_x``."""
        return sig_x * (1 - sig_x)

    def grad(self, probs, action, state):
        """Return ``state.T * (1 - probs)`` reshaped to (5, 1).

        NOTE(review): ``action`` is never read, so the returned direction is
        the same whichever action was sampled. The computation is left as it
        is here; only the documentation was added.

        :param probs: scalar output of ``policy(state)``.
        :param action: the sampled action (unused).
        :param state: the feature array ``probs`` was computed from.
        """
        dsoftmax = self.sigmoid_grad(probs)
        # sigmoid'(z) / sigmoid(z) simplifies to 1 - probs.
        dlog = dsoftmax / probs
        grad = state.T.dot(dlog)
        grad = grad.reshape(5, 1)
        return grad

    def update_with(self, grads, rewards):
        """Update ``w`` from one finished episode.

        Episodes with fewer than 50 recorded steps are skipped entirely.
        Otherwise each gradient is scaled by the discounted sum of the rewards
        from its own time step onward, ``sum(GAMMA ** t * r_t)`` with ``t``
        counted from that step, and added to the weights.

        :param grads: per-step gradients as returned by ``grad``.
        :param rewards: per-step rewards, aligned with ``grads``.
        """
        if len(grads) < 50:
            return

        # Build every return-to-go in a single backward pass:
        # G_i = r_i + GAMMA * G_{i+1}.
        returns = []
        running = 0.0
        for r in reversed(rewards):
            running = r + GAMMA * running
            returns.append(running)
        returns.reverse()

        for step_grad, future_return in zip(grads, returns):
            self.w += LEARNING_RATE * step_grad * future_return


def main(argv):
    """Train the sigmoid agent on CartPole-v0 and save the score curve.

    ``argv`` is accepted but not used. The per-episode scores are plotted
    and written to ``image1.png``.
    """
    env = gym.make('CartPole-v0')
    np.random.seed(1)

    agent = Agent()
    complete_scores = []

    for _episode in range(NUM_EPISODES):
        # Add a leading axis to the observation before expanding features.
        state = agent.poly.fit_transform(env.reset()[None, :])

        rewards = []
        grads = []
        done = False

        while not done:
            probs = agent.policy(state)
            n_actions = env.action_space.n
            # ``probs`` is the chance of action 1; action 0 gets the remainder.
            action = np.random.choice(n_actions, p=[1 - probs, probs])

            observation, reward, done, _info = env.step(action)

            # The gradient belongs to the state the action was chosen in.
            grads.append(agent.grad(probs, action, state))
            rewards.append(reward)

            state = agent.poly.fit_transform(observation.reshape(1, 4))

        agent.update_with(grads, rewards)
        # The episode score is the sum of its rewards.
        complete_scores.append(sum(rewards))

    env.close()
    episode_axis = np.arange(NUM_EPISODES)
    plt.plot(episode_axis, complete_scores)
    plt.savefig('image1.png')


if __name__ == '__main__':
    main(None)

กราฟผลการเรียนรู้ทั้งหมดยังคงเป็นแบบสุ่ม การปรับไฮเปอร์พารามิเตอร์ไม่ช่วยอะไรเลย ด้านล่างคือภาพตัวอย่าง

การอ้างอิง :

1) Deep Reinforcement Learning: Pong from Pixels

2) บทนำสู่ Policy Gradients ด้วย Cartpole และ Doom

3) การหา Policy Gradients และการอิมพลีเมนต์ REINFORCE

4) Machine Learning Trick of the Day (5): Log Derivative Trick 12

UPDATE

ดูเหมือนว่าคำตอบด้านล่างจะให้ผลที่ใช้ได้เมื่อดูจากกราฟ แต่สิ่งที่ใช้ไม่ใช่ลอการิทึมของความน่าจะเป็น (log probability) และไม่ใช่เกรเดียนต์ของนโยบายด้วยซ้ำ อีกทั้งยังเปลี่ยนจุดประสงค์ทั้งหมดของ Policy Gradient ใน RL โปรดตรวจสอบข้อมูลอ้างอิงด้านบน จากภาพ เราจะได้ข้อความต่อไปนี้

ฉันต้องหาเกรเดียนต์ของฟังก์ชันลอการิทึมของนโยบายของฉัน (ซึ่งเป็นเพียงน้ำหนักและฟังก์ชันกระตุ้น sigmoid)

— GensaGames
แหล่งที่มา

ฉันขอแนะนำให้คุณโพสต์คำถามนี้ในData Science Stack Exchangeเพราะส่วนใหญ่เป็นคำถามเชิงทฤษฎี คุณจะเข้าถึงผู้คนจำนวนมากที่มีความรู้ในโดเมนนี้

— Gilles-Philippe Paillé

@ Gilles-PhilippePailléฉันเพิ่มรหัสซึ่งเป็นตัวแทนของปัญหา สิ่งที่ฉันต้องทำก็แค่แก้ไขบางส่วนด้วยการเปิดใช้งาน โปรดตรวจสอบคำตอบที่อัพเดต

— GensaGames

จะได้รับนโยบายการไล่ระดับสีที่นี่เป็นบทความอ้างอิงกับตัวอย่างการทำงานประเภทเดียวกันของการเตรียมการ, หวังว่าคุณจะได้เรียนรู้ในรายละเอียด: medium.com/@thechrisyoon/...

— มูฮัมหมัด Usman

@MuhammadUsman ขอบคุณสำหรับข้อมูล ฉันอ่านแหล่งข้อมูลนั้นแล้ว ตอนนี้ก็ชัดเจนแล้ว และจากตัวอย่างข้างต้น ฉันพยายามเปลี่ยนฟังก์ชันกระตุ้นจาก softmax เป็น sigmoid นั่นเป็นเพียงสิ่งเดียวที่ฉันต้องทำในตัวอย่างข้างต้น

— GensaGames

@JasonChia sigmoid ให้ผลลัพธ์เป็นจำนวนจริงในช่วง [0, 1] ซึ่งตีความได้ว่าเป็นความน่าจะเป็นของการกระทำเชิงบวก (เช่น เลี้ยวขวาใน CartPole) ดังนั้นความน่าจะเป็นของการกระทำเชิงลบ (เลี้ยวซ้าย) คือ 1 - sigmoid ผลรวมของความน่าจะเป็นทั้งสองคือ 1 ใช่ นี่คือสภาพแวดล้อม CartPole

— Pavel Tyshevskyi

ปัญหาอยู่ที่เมธอด grad

def grad(self, probs, action, state):
    """Return ``state.T`` scaled by ``sigmoid_grad(probs) / probs``, shaped (5, 1).

    ``action`` is accepted but never read, so the result does not depend on
    which action was taken.
    """
    sigmoid_slope = self.sigmoid_grad(probs)
    log_slope = sigmoid_slope / probs
    return state.T.dot(log_slope).reshape(5, 1)

ในโค้ดต้นฉบับ Softmax ถูกใช้ร่วมกับฟังก์ชันสูญเสีย CrossEntropy เมื่อคุณเปลี่ยนฟังก์ชันกระตุ้นเป็น Sigmoid ฟังก์ชันสูญเสียที่เหมาะสมจะกลายเป็น Binary CrossEntropy จุดประสงค์ของเมธอด grad คือการคำนวณเกรเดียนต์ของฟังก์ชันสูญเสียเทียบกับน้ำหนัก หากข้ามรายละเอียดไป เกรเดียนต์ที่ถูกต้องคือ (probs - action) * state ตามชื่อตัวแปรในโปรแกรมของคุณ สิ่งสุดท้ายคือการเติมเครื่องหมายลบ เพราะเราต้องการทำให้ค่าลบของฟังก์ชันสูญเสียมีค่ามากที่สุด

เมธอด grad ที่ถูกต้องจึงเป็นดังนี้:

def grad(self, probs, action, state):
    """Return ``state.T`` scaled by ``action - probs``.

    This is the negative of ``state.T.dot(probs - action)``: positive along
    ``state`` when ``action`` exceeds ``probs``, negative when it is below.
    """
    step_direction = action - probs
    return state.T.dot(step_direction)

การเปลี่ยนแปลงอีกอย่างที่คุณอาจต้องการทำคือการเพิ่มอัตราการเรียนรู้ เมื่อใช้ LEARNING_RATE = 0.0001 และ NUM_EPISODES = 5000 จะได้กราฟต่อไปนี้:

การลู่เข้าจะเร็วขึ้นมากหากกำหนดค่าเริ่มต้นของน้ำหนักโดยใช้การแจกแจงแบบเกาส์ที่มีค่าเฉลี่ยเป็นศูนย์และความแปรปรวนน้อย:

def __init__(self):
    """Create the feature expander and small Gaussian initial weights."""
    # Standard-normal draws scaled by 0.01, as a 5x1 column.
    self.w = 0.01 * np.random.randn(5, 1)
    self.poly = PolynomialFeatures(1)

UPDATE

เพิ่มรหัสที่สมบูรณ์เพื่อสร้างผลลัพธ์:

import gym
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures

# Number of training episodes run by main().
NUM_EPISODES = 5000
# Step size applied to every weight update in Agent.update_with.
LEARNING_RATE = 0.0001
# Discount base used when accumulating future rewards in Agent.update_with.
GAMMA = 0.99


# noinspection PyMethodMayBeStatic
class Agent:
    """Linear sigmoid policy: one probability computed from a 5x1 weight vector.

    ``policy`` returns a single number that the training loop uses as the
    probability of action 1 (action 0 gets ``1 - p``).
    """

    def __init__(self):
        self.poly = PolynomialFeatures(1)
        # Standard-normal draws scaled by 0.01.
        self.w = np.random.randn(5, 1) * 0.01

    # Our policy that maps state to action parameterized by w
    # noinspection PyShadowingNames
    def policy(self, state):
        """Return ``sigmoid(sum(state.dot(w)))`` as a scalar."""
        z = np.sum(state.dot(self.w))
        return self.sigmoid(z)

    def sigmoid(self, x):
        """Return the logistic function ``1 / (1 + exp(-x))``."""
        s = 1 / (1 + np.exp(-x))
        return s

    def sigmoid_grad(self, sig_x):
        """Return the sigmoid derivative given the sigmoid *output* ``sig_x``."""
        return sig_x * (1 - sig_x)

    def grad(self, probs, action, state):
        """Return ``state.T`` scaled by ``action - probs``.

        :param probs: scalar output of ``policy(state)``.
        :param action: the sampled action, 0 or 1.
        :param state: the feature array ``probs`` was computed from.
        :return: column with one entry per row of ``state.T``.
        """
        # Same value as -(state.T.dot(probs - action)).
        return state.T.dot(action - probs)

    def update_with(self, grads, rewards):
        """Update ``w`` from one finished episode.

        Episodes with fewer than 50 recorded steps are skipped entirely.
        Otherwise each gradient is scaled by the discounted sum of the rewards
        from its own time step onward, ``sum(GAMMA ** t * r_t)`` with ``t``
        counted from that step, and added to the weights.

        :param grads: per-step gradients as returned by ``grad``.
        :param rewards: per-step rewards, aligned with ``grads``.
        """
        if len(grads) < 50:
            return

        # Build every return-to-go in a single backward pass:
        # G_i = r_i + GAMMA * G_{i+1}.
        returns = []
        running = 0.0
        for r in reversed(rewards):
            running = r + GAMMA * running
            returns.append(running)
        returns.reverse()

        for step_grad, future_return in zip(grads, returns):
            self.w += LEARNING_RATE * step_grad * future_return


def main(argv):
    """Train the sigmoid agent on CartPole-v0 and save the score curve.

    ``argv`` is accepted but not used. The per-episode scores are plotted
    and written to ``image1.png``.
    """
    env = gym.make('CartPole-v0')
    np.random.seed(1)

    agent = Agent()
    complete_scores = []

    for _episode in range(NUM_EPISODES):
        # Add a leading axis to the observation before expanding features.
        state = agent.poly.fit_transform(env.reset()[None, :])

        rewards = []
        grads = []
        done = False

        while not done:
            probs = agent.policy(state)
            n_actions = env.action_space.n
            # ``probs`` is the chance of action 1; action 0 gets the remainder.
            action = np.random.choice(n_actions, p=[1 - probs, probs])

            observation, reward, done, _info = env.step(action)

            # The gradient belongs to the state the action was chosen in.
            grads.append(agent.grad(probs, action, state))
            rewards.append(reward)

            state = agent.poly.fit_transform(observation.reshape(1, 4))

        agent.update_with(grads, rewards)
        # The episode score is the sum of its rewards.
        complete_scores.append(sum(rewards))

    env.close()
    episode_axis = np.arange(NUM_EPISODES)
    plt.plot(episode_axis, complete_scores)
    plt.savefig('image1.png')


if __name__ == '__main__':
    main(None)

— Pavel Tyshevskyi
แหล่งที่มา

ขอบคุณมาก. ฉันจะลองวิธีนี้ในภายหลัง

— GensaGames

ฉันไม่แน่ใจว่าคุณได้ฟังก์ชัน grad ของคุณมาจากไหน ดังที่คุณเห็นได้จากภาพด้านบน ฉันต้องหาเกรเดียนต์ของลอการิทึมของนโยบาย ซึ่งนโยบายในกรณีของฉันก็คือน้ำหนักกับ sigmoid เท่านั้น แต่ grad ในคำตอบของคุณไม่น่าจะเกี่ยวข้องกับเกรเดียนต์ของฉันเลย ใช่ไหม

— GensaGames

โปรดสังเกตว่าคุณไม่ได้นำข้อมูลใดๆ เกี่ยวกับการกระทำที่ถูกเลือกมาใช้เลย ตามการบรรยายเรื่อง Policy Gradients (สไลด์ที่ 13) การอัปเดตควรมีรูปแบบเป็น (action - probs) * sigmoid_grad(probs) แต่ฉันละ sigmoid_grad ออกไปเพราะปัญหาเกรเดียนต์ของ sigmoid ที่หายไป (vanishing gradient)

— Pavel Tyshevskyi

ประเด็นสำคัญตรงนี้คือการระบุทิศทางที่เราต้องการปรับน้ำหนัก หาก action = 1 เราต้องการให้ probs เข้าใกล้ 1 จึงเพิ่มน้ำหนัก (เกรเดียนต์เป็นบวก) หาก action = 0 เราต้องการให้ probs เข้าใกล้ 0 จึงลดน้ำหนัก (เกรเดียนต์เป็นลบ)

— Pavel Tyshevskyi

ไม่ว่าในกรณีใด การเปลี่ยนแปลงข้างต้นก็ใช้ไม่ได้เลย คุณช่วยแชร์ไฟล์ทั้งหมดได้ไหม ในขณะเดียวกัน ฉันต้องการทำตัวอย่างให้ชัดเจน และไม่สนใจปัญหา vanishing gradient ในกรณีนี้ ส่วน (action - probs) ก็เป็นเพียงอีกวิธีหนึ่งในการปรับน้ำหนักชุดเดิม

— GensaGames