Commit

Still skeptical whether advantage buffer
josiahls committed Feb 4, 2024
1 parent 41a59f8 commit dd04f5f
Showing 4 changed files with 1,848 additions and 215 deletions.
8 changes: 4 additions & 4 deletions fastrl/envs/continuous_debug_env.py
@@ -34,9 +34,9 @@ def __init__(self, goal_position=None, proximity_threshold=0.5):
         self.state = None
 
     def step(self, action):
-        self.state += action[0] # Assuming action is a NumPy array, use the first element
+        self.state[0] += action[0] # Assuming action is a NumPy array, use the first element
 
-        distance_to_goal = np.abs(self.state - self.goal_position)
+        distance_to_goal = np.abs(self.state[0] - self.goal_position)
         reward = -distance_to_goal.item() # Ensure reward is a float
 
         done = distance_to_goal <= self.proximity_threshold
@@ -48,10 +48,10 @@ def step(self, action):
 
     def reset(self, seed=None, options=None):
         super().reset(seed=seed) # Call the superclass reset, which handles the seeding
 
-        self.state = np.array([0.0], dtype=np.float32)
         if self.goal_position is None:
             self.goal_position = np.random.uniform(-10, 10)
+        # The state is {current position, goal position}
+        self.state = np.array([0.0, self.goal_position], dtype=np.float32)
 
         return self.state, {} # Return observation and an empty info dictionary
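For reference, a minimal usage sketch of the environment after this change, illustrating that the observation now carries both the current position and the goal position, and that step only advances the position element. The class name ContinuousDebugEnv and the gymnasium-style five-tuple return from step are assumptions; only the module path, the constructor signature, and the reset/step bodies are visible in this diff.

import numpy as np

# Assumed class name; only the module path appears in this commit.
from fastrl.envs.continuous_debug_env import ContinuousDebugEnv

env = ContinuousDebugEnv(proximity_threshold=0.5)

obs, info = env.reset(seed=0)
# After this commit, obs = [current position, goal position].
print(obs)  # e.g. array([0.0, <sampled goal>], dtype=float32)

action = np.array([0.5], dtype=np.float32)
# Assuming a gymnasium-style step return; the full signature is not shown in this hunk.
obs, reward, terminated, truncated, info = env.step(action)
# Only obs[0] (the position) is updated by step; obs[1] (the goal) stays fixed.
print(obs, reward, terminated)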
