

Python FunctionSet.q_value Method Code Examples

This article collects typical usage examples of the Python method chainer.FunctionSet.q_value. If you are unsure how FunctionSet.q_value is used in practice, the code examples selected below should help. For further context, see the usage examples for the containing class, chainer.FunctionSet.


Seven code examples of the FunctionSet.q_value method are shown below, sorted by popularity by default.
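
All seven examples follow the same basic pattern: q_value is simply the name given to the output layer of a FunctionSet, and it is invoked like any other layer in a hand-written forward pass. The minimal sketch below illustrates that pattern in isolation; the layer sizes are made up and it assumes the Chainer 1.x API used throughout this page.

import numpy as np
from chainer import FunctionSet, Variable
import chainer.functions as F

# A toy Q-network: one hidden layer (l1) and an output layer named q_value.
model = FunctionSet(
    l1=F.Linear(4, 32),        # state dimension -> hidden units
    q_value=F.Linear(32, 2)    # hidden units -> one Q-value per action
)

def q_func(state):
    h = F.relu(model.l1(state))    # hidden representation
    return model.q_value(h)        # Q(s, a) for every action a

s = Variable(np.zeros((1, 4), dtype=np.float32))   # dummy batch with one state
q = q_func(s)                                      # q.data has shape (1, 2)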

Example 1: Replay

# Required import: from chainer import FunctionSet [as alias]
# Or: from chainer.FunctionSet import q_value [as alias]
class QNet:
    # Hyper-Parameters
    gamma = 0.99  # Discount factor
    initial_exploration = 10**3  # Initial exploration. original: 5x10^4
    replay_size = 32  # Replay (batch) size
    target_model_update_freq = 10**4  # Target update frequency. original: 10^4
    data_size = 10**5  # Data size of history. original: 10^6
    hist_size = 1 #original: 4

    def __init__(self, use_gpu, enable_controller, dim):
        self.use_gpu = use_gpu
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller
        self.dim = dim

        print("Initializing Q-Network...")

        hidden_dim = 256
        self.model = FunctionSet(
            l4=F.Linear(self.dim*self.hist_size, hidden_dim, wscale=np.sqrt(2)),
            q_value=F.Linear(hidden_dim, self.num_of_actions,
                             initialW=np.zeros((self.num_of_actions, hidden_dim),
                                               dtype=np.float32))
        )
        if self.use_gpu >= 0:
            self.model.to_gpu()

        self.model_target = copy.deepcopy(self.model)

        self.optimizer = optimizers.RMSpropGraves(lr=0.00025, alpha=0.95, momentum=0.95, eps=0.0001)
        self.optimizer.setup(self.model.collect_parameters())

        # History Data :  D=[s, a, r, s_dash, end_episode_flag]
        self.d = [np.zeros((self.data_size, self.hist_size, self.dim), dtype=np.uint8),
                  np.zeros(self.data_size, dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.int8),
                  np.zeros((self.data_size, self.hist_size, self.dim), dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.bool)]

    def forward(self, state, action, reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        q = self.q_func(s)  # Get Q-value

        # Generate Target Signals
        tmp = self.q_func_target(s_dash)  # Q(s',*)
        if self.use_gpu >= 0:
            tmp = list(map(np.max, tmp.data.get()))  # max_a Q(s',a)
        else:
            tmp = list(map(np.max, tmp.data))  # max_a Q(s',a)

        max_q_dash = np.asanyarray(tmp, dtype=np.float32)
        if self.use_gpu >= 0:
            target = np.asanyarray(q.data.get(), dtype=np.float32)
        else:
            # make new array
            target = np.array(q.data, dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = reward[i] + self.gamma * max_q_dash[i]
            else:
                tmp_ = reward[i]

            action_index = self.action_to_index(action[i])
            target[i, action_index] = tmp_

        # TD-error clipping
        if self.use_gpu >= 0:
            target = cuda.to_gpu(target)
        td = Variable(target) - q  # TD error
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td/abs(td_tmp) * (abs(td.data) > 1)

        zero_val = np.zeros((self.replay_size, self.num_of_actions), dtype=np.float32)
        if self.use_gpu >= 0:
            zero_val = cuda.to_gpu(zero_val)
        zero_val = Variable(zero_val)
        loss = F.mean_squared_error(td_clip, zero_val)
        return loss, q

    def stock_experience(self, time,
                        state, action, reward, state_dash,
                        episode_end_flag):
        data_index = time % self.data_size

        if episode_end_flag is True:
            self.d[0][data_index] = state
            self.d[1][data_index] = action
            self.d[2][data_index] = reward
        else:
            self.d[0][data_index] = state
            self.d[1][data_index] = action
            self.d[2][data_index] = reward
            self.d[3][data_index] = state_dash
        self.d[4][data_index] = episode_end_flag

    def experience_replay(self, time):
#......... part of the code omitted .........
Developer: masayoshi-nakamura, Project: lis, Lines of code: 103, Source: q_net.py
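
The TD-error clipping block in forward is the least obvious part of Example 1: the 1000.0 * (abs(td.data) <= 1) term only pads the denominator so the division never hits zero, and the net result is that TD errors inside [-1, 1] pass through unchanged while larger errors are replaced by their sign. A NumPy-only sketch of the same arithmetic, with made-up values:

import numpy as np

td = np.array([-3.0, -0.5, 0.2, 2.5], dtype=np.float32)   # hypothetical TD errors
td_tmp = td + 1000.0 * (np.abs(td) <= 1)                   # pad small entries so the division below is safe
td_clip = td * (np.abs(td) <= 1) + td / np.abs(td_tmp) * (np.abs(td) > 1)
print(td_clip)   # [-1.  -0.5  0.2  1. ]  -- errors outside [-1, 1] collapse to their sign

Taking the mean squared error of this clipped quantity against a zero target, as the code does, keeps the gradient bounded in the same way as the error clipping described in the DQN literature.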

Example 2: Replay

# Required import: from chainer import FunctionSet [as alias]
# Or: from chainer.FunctionSet import q_value [as alias]
class DN_class:
    # Hyper-Parameters
    gamma = 0.99  # Discount factor
    initial_exploration = 100#10**4  # Initial exploration. original: 5x10^4
    replay_size = 32  # Replay (batch) size
    target_model_update_freq = 10**4  # Target update frequency. original: 10^4
    data_size = 10**5  # Data size of history. original: 10^6

    def __init__(self, enable_controller=[0, 1, 3, 4]):
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller  # Default setting : "Breakout"

        print "Initializing DN..."
#	Initialization of Chainer 1.1.0 or older.
#        print "CUDA init"
#        cuda.init()

        print "Model Building"
        self.model = FunctionSet(
            l1=F.Convolution2D(4, 32, ksize=8, stride=4, nobias=False, wscale=np.sqrt(2)),
            l2=F.Convolution2D(32, 64, ksize=4, stride=2, nobias=False, wscale=np.sqrt(2)),
            l3=F.Convolution2D(64, 64, ksize=3, stride=1, nobias=False, wscale=np.sqrt(2)),
            l4=F.Linear(3136, 256, wscale=np.sqrt(2)),
            l5=F.Linear(3136, 256, wscale=np.sqrt(2)),
            l6=F.Linear(256, 1, initialW=np.zeros((1, 256), dtype=np.float32)),
            l7=F.Linear(256, self.num_of_actions, initialW=np.zeros((self.num_of_actions, 256),
                                               dtype=np.float32)),
            q_value=DN_out.DN_out(1, self.num_of_actions, self.num_of_actions, nobias = True)
        ).to_gpu()
        
        if args.resumemodel:
            # load saved model
            serializers.load_npz(args.resumemodel, self.model)
            print "load model from resume.model"
        

        self.model_target = copy.deepcopy(self.model)

        print "Initizlizing Optimizer"
        self.optimizer = optimizers.RMSpropGraves(lr=0.00025, alpha=0.95, momentum=0.95, eps=0.0001)
        self.optimizer.setup(self.model.collect_parameters())

        # History Data :  D=[s, a, r, s_dash, end_episode_flag]
        if args.resumeD1 and args.resumeD2:
            # load saved D1 and D2
            npz_tmp1 = np.load(args.resumeD1)
            print "finished loading half of D data"
            npz_tmp2 = np.load(args.resumeD2)
            self.D = [npz_tmp1['D0'],
                      npz_tmp1['D1'],
                      npz_tmp1['D2'],
                      npz_tmp2['D3'],
                      npz_tmp2['D4']]
            npz_tmp1.close()
            npz_tmp2.close()
            print "loaded stored all D data"
        else:
            self.D = [np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
                      np.zeros(self.data_size, dtype=np.uint8),
                      np.zeros((self.data_size, 1), dtype=np.int8),
                      np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
                      np.zeros((self.data_size, 1), dtype=np.bool)]
            print "initialized D data"

    def forward(self, state, action, Reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        Q = self.Q_func(s)  # Get Q-value
        # Generate Target Signals
        tmp2 = self.Q_func(s_dash)
        tmp2 = list(map(np.argmax, tmp2.data.get()))  # argmaxQ(s',a)
        tmp = self.Q_func_target(s_dash)  # Q'(s',*)
        tmp = list(tmp.data.get())
        # select Q'(s',*) due to argmaxQ(s',a)
        res1 = []
        for i in range(num_of_batch):
            res1.append(tmp[i][tmp2[i]])

        #max_Q_dash = np.asanyarray(tmp, dtype=np.float32)
        max_Q_dash = np.asanyarray(res1, dtype=np.float32)
        target = np.asanyarray(Q.data.get(), dtype=np.float32)
        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = np.sign(Reward[i]) + self.gamma * max_Q_dash[i]
            else:
                tmp_ = np.sign(Reward[i])

            action_index = self.action_to_index(action[i])
            target[i, action_index] = tmp_
        # TD-error clipping
        td = Variable(cuda.to_gpu(target)) - Q  # TD error
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td/abs(td_tmp) * (abs(td.data) > 1)

        zero_val = Variable(cuda.to_gpu(np.zeros((self.replay_size, self.num_of_actions), dtype=np.float32)))
        loss = F.mean_squared_error(td_clip, zero_val)
        return loss, Q

#......... part of the code omitted .........
Developer: masataka46, Project: DuelingNetwork, Lines of code: 103, Source: dn_agent.py
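
Example 2 is a Dueling Network: instead of a plain Linear output, its q_value entry is a custom DN_out function that combines a scalar state value (from l6) with per-action advantages (from l7). DN_out itself is not shown in the excerpt; the standard dueling aggregation it presumably implements is the following, sketched here in plain NumPy with made-up numbers rather than the repository's code:

import numpy as np

V = np.array([[1.2]], dtype=np.float32)                   # state value, shape (batch, 1)
A = np.array([[0.3, -0.1, 0.5, 0.1]], dtype=np.float32)   # advantages, shape (batch, actions)

# Q(s, a) = V(s) + A(s, a) - mean_a A(s, a): subtracting the mean keeps the
# value/advantage split identifiable without changing the argmax over actions.
Q = V + A - A.mean(axis=1, keepdims=True)
print(Q)   # [[1.3  0.9  1.5  1.1]]

The forward method above also uses Double-DQN-style targets: the greedy action is chosen with the online network (argmax over Q_func(s')) but evaluated with the target network Q_func_target(s').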

Example 3: __init__

# Required import: from chainer import FunctionSet [as alias]
# Or: from chainer.FunctionSet import q_value [as alias]

#......... part of the code omitted .........
            conv4_29_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_29_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_30_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_30_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_30_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_31_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_31_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_31_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_32_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_32_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_32_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_33_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_33_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_33_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_34_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_34_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_34_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_35_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_35_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_35_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_36_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_36_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_36_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv5_1_1=F.Convolution2D(1024,  512,  1, wscale=w, stride=2),
            conv5_1_2=F.Convolution2D(512,  512,  3, wscale=w, stride=1, pad=1),
            conv5_1_3=F.Convolution2D(512,  2048,  1, wscale=w, stride=1),
            conv5_1_ex=F.Convolution2D(1024,  2048,  1, wscale=w, stride=2),
            conv5_2_1=F.Convolution2D(2048,  512,  1, wscale=w, stride=1),
            conv5_2_2=F.Convolution2D(512,  512,  3, wscale=w, stride=1, pad=1),
            conv5_2_3=F.Convolution2D(512,  2048,  1, wscale=w, stride=1),
            conv5_3_1=F.Convolution2D(2048,  512,  1, wscale=w, stride=1),
            conv5_3_2=F.Convolution2D(512,  512,  3, wscale=w, stride=1, pad=1),
            conv5_3_3=F.Convolution2D(512,  2048,  1, wscale=w, stride=1),
            q_value=F.Linear(2048, self.num_of_actions,
                             initialW=np.zeros((self.num_of_actions, 2048),
                                               dtype=np.float32))
        )

        self.model_target = copy.deepcopy(self.model)

        print "Initizlizing Optimizer"
        self.optimizer = optimizers.Adam()
        self.optimizer.setup(self.model.collect_parameters())

    def forward(self, state, action, Reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        Q = self.Q_func(s)  # Get Q-value

        # Generate Target Signals
        tmp = self.Q_func_target(s_dash)  # Q(s',*)
        tmp = list(map(np.max, tmp.data.get()))  # max_a Q(s',a)
        max_Q_dash = np.asanyarray(tmp, dtype=np.float32)
        target = np.asanyarray(Q.data.get(), dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = np.sign(Reward[i]) + self.gamma * max_Q_dash[i]
            else:
                tmp_ = np.sign(Reward[i])

            action_index = self.action_to_index(action[i])
            target[i, action_index] = tmp_
Developer: imenurok, Project: TouhouAItest, Lines of code: 69, Source: DQN.py

Example 4: Replay

# Required import: from chainer import FunctionSet [as alias]
# Or: from chainer.FunctionSet import q_value [as alias]
class DQN_class:
    # Hyper-Parameters
    gamma = 0.99  # Discount factor
    initial_exploration = 10**4  # Initial exploration. original: 5x10^4
    replay_size = 32  # Replay (batch) size
    target_model_update_freq = 10**4  # Target update frequency. original: 10^4
    data_size = 10**5  # Data size of history. original: 10^6

    def __init__(self, enable_controller=[0, 3, 4]):
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller  # Default setting : "Pong"

        print "Initializing DQN..."
#	Initialization for Chainer 1.1.0 or older.
#        print "CUDA init"
#        cuda.init()

        print "Model Building"
        self.model = FunctionSet(
            l1=F.Convolution2D(4, 16, ksize=8, stride=4, wscale=np.sqrt(2)),
            l2=F.Convolution2D(16, 32, ksize=4, stride=2, wscale=np.sqrt(2)),
            l3=F.Linear(2592, 256),
            q_value=F.Linear(256, self.num_of_actions,
                             initialW=np.zeros((self.num_of_actions, 256),
                                               dtype=np.float32))
        ).to_gpu()

        print "Initizlizing Optimizer"
        self.optimizer = optimizers.RMSpropGraves(lr=0.0002, alpha=0.3, momentum=0.2)
        self.optimizer.setup(self.model.collect_parameters())

        # History Data :  D=[s, a, r, s_dash, end_episode_flag]
        self.D = [np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
                  np.zeros(self.data_size, dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.int8),
                  np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.bool)]

    def forward(self, state, action, Reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        Q = self.Q_func(s)  # Get Q-value

        # Generate Target Signals
        max_Q_dash_ = self.Q_func(s_dash)
        tmp = list(map(np.max, max_Q_dash_.data.get()))
        max_Q_dash = np.asanyarray(tmp, dtype=np.float32)
        target = np.asanyarray(Q.data.get(), dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = np.sign(Reward[i]) + self.gamma * max_Q_dash[i]
            else:
                tmp_ = np.sign(Reward[i])
            target[i, self.action_to_index(action[i])] = tmp_

        loss = F.mean_squared_error(Variable(cuda.to_gpu(target)), Q)
        return loss, Q

    def stockExperience(self, time,
                        state, action, reward, state_dash,
                        episode_end_flag):
        data_index = time % self.data_size

        if episode_end_flag is True:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
        else:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
            self.D[3][data_index] = state_dash
        self.D[4][data_index] = episode_end_flag

    def experienceReplay(self, time):

        if self.initial_exploration < time:
            # Pick up replay_size number of samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time, (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1))

            s_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84), dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84), dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.D[0][replay_index[i]], dtype=np.float32)
                a_replay[i] = self.D[1][replay_index[i]]
                r_replay[i] = self.D[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.D[3][replay_index[i]], dtype=np.float32)
                episode_end_replay[i] = self.D[4][replay_index[i]]

            s_replay = cuda.to_gpu(s_replay)
            s_dash_replay = cuda.to_gpu(s_dash_replay)
#......... part of the code omitted .........
Developer: aaronzhudp, Project: DQN-chainer, Lines of code: 103, Source: dqn_agent_nips.py
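
Example 4 is the older NIPS-2013 flavour of DQN: the bootstrap value max_a Q(s', a) is computed with the online network itself rather than a frozen target copy, and no TD-error clipping is applied, so the loss is a plain mean squared error between the assembled target matrix and Q(s, ·). The excerpt ends before action selection; an epsilon-greedy policy of the kind these agents pair with experienceReplay would look roughly like the hypothetical sketch below (not code from the repository):

    def e_greedy(self, state, epsilon):
        # Hypothetical helper: random enabled action with probability epsilon,
        # otherwise the action with the largest predicted Q-value.
        # state is assumed to already have shape (1, 4, 84, 84).
        s = Variable(cuda.to_gpu(np.asarray(state, dtype=np.float32)))
        q = self.Q_func(s)
        if np.random.rand() < epsilon:
            index = np.random.randint(0, self.num_of_actions)
        else:
            index = int(np.argmax(q.data.get()))
        return self.enable_controller[index], q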

Example 5: Replay

# Required import: from chainer import FunctionSet [as alias]
# Or: from chainer.FunctionSet import q_value [as alias]
class DQN_class:
    # Hyper-Parameters
    gamma = 0.99  # Discount factor
    initial_exploration = 100#10**4  # Initial exploration. original: 5x10^4
    replay_size = 32  # Replay (batch) size
    target_model_update_freq = 10**4  # Target update frequency. original: 10^4
    data_size = 10**5 #10**5  # Data size of history. original: 10^6

    def __init__(self, enable_controller=[0, 3, 4]):
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller  # Default setting : "Pong"

        print "Initializing DQN..."

        print "Model Building"
        self.CNN_model = FunctionSet(
            l1=F.Convolution2D(4, 32, ksize=8, stride=4, nobias=False, wscale=np.sqrt(2)),
            l2=F.Convolution2D(32, 64, ksize=4, stride=2, nobias=False, wscale=np.sqrt(2)),
            l3=F.Convolution2D(64, 64, ksize=3, stride=1, nobias=False, wscale=np.sqrt(2)),
            )

        self.model = FunctionSet(
            l4=F.Linear(3136, 512, wscale=np.sqrt(2)),
            q_value=F.Linear(512, self.num_of_actions,
                             initialW=np.zeros((self.num_of_actions, 512),
                                               dtype=np.float32))
        ).to_gpu()
        
        d = 'elite/'
        
        self.CNN_model.l1.W.data = np.load(d+'l1_W.npy')#.astype(np.float32)
        self.CNN_model.l1.b.data = np.load(d+'l1_b.npy')#.astype(np.float32)
        self.CNN_model.l2.W.data = np.load(d+'l2_W.npy')#.astype(np.float32)
        self.CNN_model.l2.b.data = np.load(d+'l2_b.npy')#.astype(np.float32)
        self.CNN_model.l3.W.data = np.load(d+'l3_W.npy')#.astype(np.float32)
        self.CNN_model.l3.b.data = np.load(d+'l3_b.npy')#.astype(np.float32)

        self.CNN_model = self.CNN_model.to_gpu()
        self.CNN_model_target = copy.deepcopy(self.CNN_model)
        self.model_target = copy.deepcopy(self.model)


        
        print "Initizlizing Optimizer"
        self.optimizer = optimizers.RMSpropGraves(lr=0.00025, alpha=0.95, momentum=0.95, eps=0.0001)
        self.optimizer.setup(self.model.collect_parameters())

        # History Data :  D=[s, a, r, s_dash, end_episode_flag]
        self.D = [np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
                  np.zeros(self.data_size, dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.int8),
                  np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.bool),
                  np.zeros((self.data_size, 1), dtype=np.uint8)]
        


    def forward(self, state, action, Reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        Q = self.Q_func(s)  # Get Q-value

        # Generate Target Signals
        tmp = self.Q_func_target(s_dash)  # Q(s',*)
        tmp = list(map(np.max, tmp.data.get()))  # max_a Q(s',a)
        max_Q_dash = np.asanyarray(tmp, dtype=np.float32)
        target = np.asanyarray(Q.data.get(), dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = np.sign(Reward[i]) + self.gamma * max_Q_dash[i]
            else:
                tmp_ = np.sign(Reward[i])

            action_index = self.action_to_index(action[i])
            target[i, action_index] = tmp_

        # TD-error clipping
        td = Variable(cuda.to_gpu(target)) - Q  # TD error
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td/abs(td_tmp) * (abs(td.data) > 1)

        zero_val = Variable(cuda.to_gpu(np.zeros((self.replay_size, self.num_of_actions), dtype=np.float32)))
        loss = F.mean_squared_error(td_clip, zero_val)
        return loss, Q

    def stockExperience(self, time,
                        state, action, lstm_reward, state_dash,
                        episode_end_flag, ale_reward):
        data_index = time % self.data_size

        if episode_end_flag is True:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = lstm_reward
            self.D[5][data_index] = ale_reward
        else:
            self.D[0][data_index] = state
#......... part of the code omitted .........
Developer: TakuTsuzuki, Project: Hackathon2015, Lines of code: 103, Source: imitation_learning_DQN_LSTM.py
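
Example 5 splits the network in two: the convolutional trunk CNN_model is loaded from pre-trained .npy weights in the elite/ directory and kept out of the optimizer, so only the two-layer head in model is trained. The Q_func used by forward is not included in the excerpt; chained over the layers defined above, it would plausibly read as follows (a hypothetical reconstruction, not the repository's code):

    def Q_func(self, state):
        # Frozen, pre-trained feature extractor: three ReLU convolutions.
        h1 = F.relu(self.CNN_model.l1(state))
        h2 = F.relu(self.CNN_model.l2(h1))
        h3 = F.relu(self.CNN_model.l3(h2))
        # Trainable head: fully connected layer, then one Q-value per action.
        h4 = F.relu(self.model.l4(h3))
        return self.model.q_value(h4)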

Example 6: Replay

# Required import: from chainer import FunctionSet [as alias]
# Or: from chainer.FunctionSet import q_value [as alias]
class DQN_class:
    # Hyper-Parameters
    gamma = 0.99  # Discount factor
    initial_exploration = 50000  # 10**4  # Initial exploration. original: 5x10^4
    replay_size = 32  # Replay (batch) size
    target_model_update_freq = 10 ** 4  # Target update frequency. original: 10^4
    data_size = 5 * (10 ** 5)  # Data size of history. original: 10^6

    field_num = 7
    field_size = 17

    def __init__(self, control_size=10, field_num=7, field_size=17):
        self.num_of_actions = control_size
        self.field_size = field_size

        # self.enable_controller = enable_controller  # Default setting : "Pong"


        print "Initializing DQN..."
        #	Initialization of Chainer 1.1.0 or older.
        # print "CUDA init"
        # cuda.init()

        self.field_num = field_num

        print "Model Building"
        self.model = FunctionSet(
                l1=F.Convolution2D(self.field_num * 4, 16, ksize=5, stride=1, nobias=False, wscale=np.sqrt(2)),
                l2=F.Convolution2D(16, 24, ksize=4, stride=1, nobias=False, wscale=np.sqrt(2)),
                l3=F.Linear(2400, 512, wscale=np.sqrt(2)),
                q_value=F.Linear(512, self.num_of_actions,
                                 initialW=np.zeros((self.num_of_actions, 512),
                                                   dtype=np.float32))
        ).to_gpu()

        self.model_target = copy.deepcopy(self.model)

        print "Initizlizing Optimizer"
        self.optimizer = optimizers.RMSpropGraves(lr=0.00025, alpha=0.95, momentum=0.95, eps=0.0001)
        # self.optimizer.setup(self.model.collect_parameters())
        self.optimizer.setup(self.model)

        # History Data :  D=[s, a, r, s_dash, end_episode_flag]
        self.D = [np.zeros((self.data_size, self.field_num * 4, self.field_size, self.field_size), dtype=np.uint8),
                  np.zeros(self.data_size, dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.float32),
                  np.zeros((self.data_size, self.field_num * 4, self.field_size, self.field_size), dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.bool)]

    def forward(self, state, action, Reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        Q = self.Q_func(s)  # Get Q-value

        # Generate Target Signals
        tmp = self.Q_func_target(s_dash)  # Q(s',*)
        tmp = list(map(np.max, tmp.data.get()))  # max_a Q(s',a)
        max_Q_dash = np.asanyarray(tmp, dtype=np.float32)
        target = np.asanyarray(Q.data.get(), dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = np.sign(Reward[i]) + self.gamma * max_Q_dash[i]
            else:
                tmp_ = np.sign(Reward[i])

                # action_index = self.action_to_index(action[i])
            target[i, action[i]] = tmp_

        # TD-error clipping


        td = Variable(cuda.to_gpu(target)) - Q  # TD error
        # td = Variable(target) - Q  # TD error


        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td / abs(td_tmp) * (abs(td.data) > 1)

        #print "td_data " + str(td_clip.data)

        zero_val = Variable(cuda.to_gpu(np.zeros((self.replay_size, self.num_of_actions), dtype=np.float32)))
        # zero_val = Variable(np.zeros((self.replay_size, self.num_of_actions), dtype=np.float32))



        loss = F.mean_squared_error(td_clip, zero_val)
        return loss, Q

    def stockExperience(self, time,
                        state, action, reward, state_dash,
                        episode_end_flag):
        data_index = time % self.data_size

        if episode_end_flag is True:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
#......... part of the code omitted .........
Developer: kwrig, Project: SamurAI_coding_2015_16, Lines of code: 103, Source: dqn_agent_nature_gpu.py

Example 7: Replay

# Required import: from chainer import FunctionSet [as alias]
# Or: from chainer.FunctionSet import q_value [as alias]
class DQN_class:
    # Hyper-Parameters
    gamma = 0.99                       # Discount factor
    initial_exploration = 5*10**4      # 10**4  # Initial exploration. original: 5x10^4
    replay_size = 32                   # Replay (batch) size
    target_model_update_freq = 10**4   # Target update frequency. original: 10^4
    data_size = 10**6                  # Data size of history. original: 10^6
    num_of_actions = 2                 # Action dimension
    num_of_states = 12                 # State dimension
    
    def __init__(self):
                  
        print "Initializing DQN..."
#	Initialization of Chainer 1.1.0 or older.
#        print "CUDA init"
#        cuda.init()

        print "Model Building"
#        self.model = FunctionSet(
#            l1=F.Convolution2D(4, 32, ksize=8, stride=4, nobias=False, wscale=np.sqrt(2)),
#            l2=F.Convolution2D(32, 64, ksize=4, stride=2, nobias=False, wscale=np.sqrt(2)),
#            l3=F.Convolution2D(64, 64, ksize=3, stride=1, nobias=False, wscale=np.sqrt(2)),
#            l4=F.Linear(3136, 512, wscale=np.sqrt(2)),
#            q_value=F.Linear(512, self.num_of_actions,
#                             initialW=np.zeros((self.num_of_actions, 512),
#                                               dtype=np.float32))
#        ).to_gpu()
        
#        self.critic = FunctionSet(
#            l1=F.Linear(self.num_of_actions+self.num_of_states,512),
#            l2=F.Linear(512,256),
#            l3=F.Linear(256,128),
#            q_value=F.Linear(128,1,initialW=np.zeros((1,128),dtype=np.float32))
#        ).to_gpu()
#        
#        self.actor = FunctionSet(
#            l1=F.Linear(self.num_of_states,512),
#            l2=F.Linear(512,256),
#            l3=F.Linear(256,128),
#            a_value=F.Linear(128,self.num_of_actions,initialW=np.zeros((1,128),dtype=np.float32))
#        ).to_gpu()
        
        self.critic = FunctionSet(
            l1=F.Linear(self.num_of_actions+self.num_of_states,1024),
            l2=F.Linear(1024,512),
            l3=F.Linear(512,256),
            l4=F.Linear(256,128),
            q_value=F.Linear(128,1,initialW=np.zeros((1,128),dtype=np.float32))
        ).to_gpu()
        
        self.actor = FunctionSet(
            l1=F.Linear(self.num_of_states,1024),
            l2=F.Linear(1024,512),
            l3=F.Linear(512,256),
            l4=F.Linear(256,128),
            a_value=F.Linear(128,self.num_of_actions,initialW=np.zeros((1,128),dtype=np.float32))
        ).to_gpu()
        
#        self.critic = FunctionSet(
#            l1=F.Linear(self.num_of_actions+self.num_of_states,1024,wscale=0.01*math.sqrt(self.num_of_actions+self.num_of_states)),
#            l2=F.Linear(1024,512,wscale=0.01*math.sqrt(1024)),
#            l3=F.Linear(512,256,wscale=0.01*math.sqrt(512)),
#            l4=F.Linear(256,128,wscale=0.01*math.sqrt(256)),
#            q_value=F.Linear(128,1,wscale=0.01*math.sqrt(128))
#        ).to_gpu()
#        
#        self.actor = FunctionSet(
#            l1=F.Linear(self.num_of_states,1024,wscale=0.01*math.sqrt(self.num_of_states)),
#            l2=F.Linear(1024,512,wscale=0.01*math.sqrt(1024)),
#            l3=F.Linear(512,256,wscale=0.01*math.sqrt(512)),
#            l4=F.Linear(256,128,wscale=0.01*math.sqrt(256)),
#            a_value=F.Linear(128,self.num_of_actions,wscale=0.01*math.sqrt(128))
#        ).to_gpu()
        
        self.critic_target = copy.deepcopy(self.critic) 
        self.actor_target = copy.deepcopy(self.actor)
        
        print "Initizlizing Optimizer"
        #self.optim_critic = optimizers.RMSpropGraves(lr=0.0001, alpha=0.95, momentum=0.95, eps=0.0001)
        #self.optim_actor = optimizers.RMSpropGraves(lr=0.0001, alpha=0.95, momentum=0.95, eps=0.0001)
        self.optim_critic = optimizers.Adam(alpha=0.00001)
        self.optim_actor = optimizers.Adam(alpha=0.00001)
        self.optim_critic.setup(self.critic)
        self.optim_actor.setup(self.actor)
        
#        self.optim_critic.add_hook(chainer.optimizer.WeightDecay(0.00001))
#        self.optim_critic.add_hook(chainer.optimizer.GradientClipping(10))
#        self.optim_actor.add_hook(chainer.optimizer.WeightDecay(0.00001))
#        self.optim_actor.add_hook(chainer.optimizer.GradientClipping(10))

        # History Data :  D=[s, a, r, s_dash, end_episode_flag]
        self.D = [np.zeros((self.data_size, self.num_of_states), dtype=np.float32),
                  np.zeros((self.data_size, self.num_of_actions), dtype=np.float32),
                  np.zeros((self.data_size, 1), dtype=np.float32),
                  np.zeros((self.data_size, self.num_of_states), dtype=np.float32),
                  np.zeros((self.data_size, 1), dtype=np.bool)]
                  
#        with open('dqn_dump.json', 'a') as f:
#            json.dump(datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S'), f)
#            f.write('\n')
#......... part of the code omitted .........
Developer: hughhugh, Project: dqn-vrep, Lines of code: 103, Source: agent_dqn_ddac.py
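
Example 7 reuses the same building blocks for an actor-critic (DDPG-style) agent with continuous actions: the critic's q_value head scores a concatenated state-action pair, while the actor's a_value head outputs the action itself, and each network has its own Adam optimizer and target copy. The forward passes are omitted from the excerpt; they are conventionally wired as below (a hypothetical sketch in the same Chainer 1.x style, not the repository's code):

    def Q_func(self, state, action):
        # Critic: Q(s, a) computed from the concatenated state-action vector.
        h = F.relu(self.critic.l1(F.concat((state, action), axis=1)))
        h = F.relu(self.critic.l2(h))
        h = F.relu(self.critic.l3(h))
        h = F.relu(self.critic.l4(h))
        return self.critic.q_value(h)

    def A_func(self, state):
        # Actor: deterministic action a = mu(s).
        h = F.relu(self.actor.l1(state))
        h = F.relu(self.actor.l2(h))
        h = F.relu(self.actor.l3(h))
        h = F.relu(self.actor.l4(h))
        return self.actor.a_value(h)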


Note: The chainer.FunctionSet.q_value examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets are drawn from open-source projects contributed by their authors; copyright remains with the original authors, and redistribution or use should follow the corresponding project licenses. Do not reproduce without permission.