“PPO is a reinforcement learning algorithm that helps an agent learn better actions over time while ensuring each learning step is small and safe.”
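The “small and safe” part comes from PPO’s clipped surrogate objective: the probability ratio between the new and old policy is clipped, so a single update cannot move the policy too far. Here is a minimal PyTorch sketch of that loss (the function and tensor names are illustrative, not part of the tutorial code):

import torch

def ppo_clip_loss(log_probs_new, log_probs_old, advantages, clip_eps=0.2):
    # ratio > 1 means the new policy favors the action more than the old one
    ratio = torch.exp(log_probs_new - log_probs_old)
    unclipped = ratio * advantages
    # clipping the ratio to [1 - eps, 1 + eps] caps how big one update can be
    clipped = torch.clamp(ratio, 1 - clip_eps, 1 + clip_eps) * advantages
    # take the pessimistic minimum, negated for gradient descent
    return -torch.min(unclipped, clipped).mean()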
Example : Mini RLHF + PPO
We create a simple “Reward Model” with one rule:
→ The closer the guess is to 7, the higher the reward
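To see the reward shape concretely, here is a quick check (my own illustration, just the Step 2 reward formula evaluated for every possible guess):

for a in range(10):
    print(a, 1.0 - (abs(a - 7) / 10))
# guess 7 earns the maximum reward of 1.0, guess 0 only 0.3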
Step 1 : Install the library
!pip install stable-baselines3 gym --quiet

Step 2 : Create an Environment and Reward Model
import gym
from gym import spaces
import numpy as np

class NumberGuessEnv(gym.Env):
    """
    Simple env: the agent tries to guess a number between 0 and 9.
    Guesses close to 7 earn a high reward.
    """
    def __init__(self):
        super().__init__()
        self.action_space = spaces.Discrete(10)      # guess 0-9
        self.observation_space = spaces.Discrete(1)  # dummy observation
        self.state = 0

    def reset(self):
        return self.state

    def step(self, action):
        reward = self.reward_model(action)
        done = True  # every episode ends after one step
        return self.state, reward, done, {}

    def reward_model(self, action):
        """Reward: numbers close to 7 get more points."""
        return 1.0 - (abs(action - 7) / 10)

env = NumberGuessEnv()
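A quick sanity check (my addition, not one of the tutorial steps): stepping the environment with the perfect guess should return the maximum reward.

obs = env.reset()
obs, reward, done, info = env.step(7)  # the perfect guess
print(reward, done)                    # expected: 1.0 True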

Step 3 : Install shimmy (shimmy>=2.0) so Stable-Baselines3 can wrap the legacy OpenAI Gym environment.
!pip install "shimmy>=2.0" --quiet
Step 4 : Create the PPO Agent and Train (10,000 timesteps takes around 5 minutes)
from stable_baselines3 import PPO

# PPO with a small default MLP policy network
model = PPO(
    "MlpPolicy",
    env,
    verbose=1,
    learning_rate=0.01,
    n_steps=5,      # short rollouts, since every episode is one step long
    batch_size=5,
    ent_coef=0.01,  # entropy bonus keeps the agent exploring all 10 guesses
)

# Train for 10,000 timesteps
model.learn(total_timesteps=10000)
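If you want to keep the trained agent, Stable-Baselines3 can save and reload it (a small extra beyond the tutorial; the filename is arbitrary):

model.save("ppo_number_guess")  # writes ppo_number_guess.zip
model = PPO.load("ppo_number_guess", env=env)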

Step 5 : Test the agent; repeated guesses should land near 7
# Testing the agent
obs = env.reset()
total_reward = 0
print("Agent answers:")
for _ in range(10):
    action, _states = model.predict(obs, deterministic=True)
    print(action)
    obs, reward, done, info = env.step(action)
    total_reward += reward
    if done:
        obs = env.reset()  # one-step episodes: reset so the agent can guess again
print(f"Total reward: {total_reward}")

The full example code is on Colab: https://colab.research.google.com/drive/1xwft2i2zcbesO9deBhQ44wQeC7Q3OIvr?usp=sharing
Thank you.