Commit 10077d25 authored by hazrmard

Repo state for IFAC 2020 Congress paper
*.ini
dev.yml
.ipynb_checkpoints
__pycache__
.vscode
%% Cell type:code id: tags:
``` python
import sys, os
pyStateSpace_path = os.path.abspath('../pyStateSpace')
pyTorchBridge_path = os.path.abspath('../pyTorchBridge')
try:
    import pystatespace
except ImportError:
    if pyStateSpace_path not in sys.path:
        sys.path.append(pyStateSpace_path)
try:
    import pytorchbridge
except ImportError:
    if pyTorchBridge_path not in sys.path:
        sys.path.append(pyTorchBridge_path)
```
%% Cell type:code id: tags:
``` python
%reload_ext autoreload
%autoreload 2
%matplotlib inline
from copy import deepcopy
from multiprocessing import Pool
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # disable tensorflow warning messages
import numpy as np
import gym
import pandas as pd
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, make_scorer
import matplotlib.pyplot as plt
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import PPO2
from tqdm.auto import tqdm, trange
from pystatespace import SS_ODE, Trapezoid
from tanks import TanksFactory, TanksPhysicalEnv, TanksDataEnv, TanksDataRecurrentEnv
from plotting import plot_tanks
from utils import cache_function, cache_to_training_set, rewards_from_actions
SMALL_SIZE = 15
MEDIUM_SIZE = 17
BIGGER_SIZE = 19
plt.rc('font', size=SMALL_SIZE) # controls default text sizes
plt.rc('axes', titlesize=SMALL_SIZE) # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE) # fontsize of the x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE) # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE) # fontsize of the tick labels
plt.rc('legend', fontsize=SMALL_SIZE) # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE) # fontsize of the figure title
plt.rc('lines', linewidth = 2.5)
```
%% Cell type:markdown id: tags:
## System
%% Cell type:code id: tags:
``` python
n_tanks = 6
n_engines = 2
tstep = 1e0
seed = 0
np.random.seed(seed)
nominal_config = {
'heights': np.ones(n_tanks),
'cross_section':np.ones(n_tanks),
'valves_min':np.zeros(n_tanks),
'valves_max':np.ones(n_tanks),
'resistances':np.ones(n_tanks) * 1e2,
'pumps':np.ones(n_tanks) * 0.1,
'engines':np.ones(n_engines) * 0.05
}
tanks = TanksFactory(n = n_tanks, e = n_engines, **nominal_config)
system = Trapezoid(dims=6, outputs=6, dx=tanks.dxdt, out=tanks.y, tstep=tstep)
```
%% Cell type:code id: tags:
``` python
env_physical = DummyVecEnv([lambda: TanksPhysicalEnv(tanks, tstep=tstep)])
plot_tanks(env_physical.envs[0], plot='closed')
plt.suptitle('Fuel tank levels over time');
```
%% Cell type:markdown id: tags:
## System model
%% Cell type:code id: tags:
``` python
# Random actions, each repeated for a random number
# of time steps
def rand_u(length: int, dim: int):
    i = 0
    u = np.zeros((length, dim))
    while i < length:
        subseq = min(length - i, np.random.randint(1, int(length / 2)))
        u_ = np.random.choice(2, (1, dim))
        u[i:i+subseq] = u_
        i += subseq
    return u
```
%% Cell type:code id: tags:
``` python
# Generate training data
episode_duration = sum(tanks.heights * tanks.cross_section) \
/ min(sum(tanks.pumps), sum(tanks.engines))
episode_length = int(episode_duration / system.tstep)
episodes = 50
# An episode of length n will have n-1
# state[i], action[i], state[i+1] tuples
arr_len = episode_length - 1
Xtrain = np.empty((episodes * arr_len, n_tanks * 2))
Ytrain = np.empty((episodes * arr_len, n_tanks))
for e in range(episodes):
    u = rand_u(episode_length, n_tanks)
    t = np.linspace(0, episode_duration, num=episode_length, endpoint=False)
    x, _ = system.predict(t, tanks.heights, u)
    Xtrain[e * arr_len: (e+1) * arr_len, :n_tanks] = x[:-1]
    Xtrain[e * arr_len: (e+1) * arr_len, n_tanks:] = u[:-1]
    Ytrain[e * arr_len: (e+1) * arr_len] = x[1:]
print(Xtrain.shape, Ytrain.shape)
```
%% Cell type:code id: tags:
``` python
grid = GridSearchCV(MLPRegressor(), scoring=make_scorer(r2_score, multioutput='uniform_average'),
param_grid={
'hidden_layer_sizes': ((64, 64), (128, 128), (256, 256), (512, 512)),
'activation': ('relu', 'logistic'),
'learning_rate_init': (1e-2, 5e-3, 1e-3),
},
n_jobs=12, verbose=1)
grid.fit(Xtrain, Ytrain)
pd.DataFrame(grid.cv_results_).sort_values(by='rank_test_score', ascending=True, axis=0).head()
```
%% Cell type:code id: tags:
``` python
# Train on episodes-1, validate on 1 episode worth of instances:
est = grid.best_estimator_
est.set_params(random_state=seed)
# Plot performance
env_data = TanksDataEnv(tanks, est, tstep)
plot_tanks(env_data, plot='closed')
plt.suptitle('Modeled fuel tank levels over time');
```
%% Cell type:markdown id: tags:
## Degradation
%% Cell type:code id: tags:
``` python
def degrade(tanks: TanksFactory, time: float, tfactor: np.ndarray,
            efactor: np.ndarray, **nominal):
    if not isinstance(tfactor, (list, tuple, np.ndarray)):
        # If a single degradation factor is given, assume it is
        # identical for all tanks.
        tfactor = np.ones(n_tanks) * tfactor
    if not isinstance(efactor, (list, tuple, np.ndarray)):
        # If a single degradation factor is given, assume it is
        # identical for all engines.
        efactor = np.ones(n_engines) * efactor
    for i in range(n_tanks):
        tanks.pumps[i] = nominal['pumps'][i] * (1 - time / tfactor[i])
        tanks.resistances[i] = nominal['resistances'][i] + \
            nominal['resistances'][i] * time / tfactor[i]
    for i in range(n_engines):
        tanks.engines[i] = nominal['engines'][i] + \
            nominal['engines'][i] * time / efactor[i]
```
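%% Cell type:markdown id: tags:
In summary, `degrade` (above) scales the nominal parameters linearly with elapsed time $t$: pump rates decay as $p_i(t) = p_i^{\mathrm{nom}}(1 - t/\tau_i)$, while valve resistances and engine demands grow as $R_i(t) = R_i^{\mathrm{nom}}(1 + t/\tau_i)$ and $e_j(t) = e_j^{\mathrm{nom}}(1 + t/\eta_j)$, writing the per-tank and per-engine degradation factors as $\tau_i$ and $\eta_j$. A factor of $\infty$ keeps that component at its nominal value.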
%% Cell type:markdown id: tags:
## Environment
%% Cell type:code id: tags:
``` python
# Reward function performance for normal vs degraded system
t = TanksFactory(n = n_tanks, e = n_engines, **nominal_config)
e = TanksPhysicalEnv(t, tstep)
tfactor = [np.inf, np.inf, np.inf, np.inf, np.inf, np.inf]
efactor = [20., np.inf]
plt.figure(figsize=(8, 6))
u, r, d = np.zeros(6), [], False
e.reset()
while not d:
    _, r_, d, _ = e.step(u)
    r.append(r_)
plt.plot(r, 'c-', label='Normal')
degrade(t, 10, tfactor, efactor, **nominal_config)
u, r, d = np.zeros(6), [], False
e.reset()
while not d:
    _, r_, d, _ = e.step(u)
    r.append(r_)
plt.plot(r, 'r:', label='Degraded')
plt.legend()
plt.title('Rewards over single episode')
plt.ylabel('Reward')
plt.xlabel('Time /s')
plt.grid(True)
plt.show()
```
%% Cell type:markdown id: tags:
## Control Loop
1. Learn nominal policy from physical model
2. During operation (online):
    1. Degrade system
    2. Improve control using on-policy RL on the actual system
3. Offline (more sample-efficient):
    1. Learn a data-driven model of the system from recorded measurements
    2. Improve control using on-policy RL on the data-driven model
4. Go to 2 (a minimal sketch of this loop follows below)
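
A minimal pseudocode-style sketch of this loop; it mirrors the `trial` function defined later in this notebook (hyperparameters such as `periods`, `online_interval`, and `offline_interval` are set in the next cells, and `x_cache`, `u_cache`, `d_cache` are transitions recorded by the learning callback during online training):

```python
agent = PPO2(MlpPolicy, env_physical)                              # 1. nominal policy on the physical model
for period in range(periods):
    degrade(tanks, period, tfactor, efactor, **nominal_config)     # 2.1 system degrades
    agent.set_env(env_physical)
    agent.learn(total_timesteps=online_interval)                   # 2.2 on-policy RL on the actual system
    if offline:
        # 3.1 fit a data-driven model to transitions recorded during online learning
        xu, x_next = cache_to_training_set(x_cache, u_cache, d_cache)
        model.fit(xu, x_next)
        # 3.2 improve control on the data-driven model
        agent.set_env(DummyVecEnv([lambda: TanksDataEnv(tanks, model, tstep)]))
        agent.learn(total_timesteps=offline_interval)
```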
%% Cell type:code id: tags:
``` python
## Reinforcement learning
update_interval = 64
online_interval = 1024
offline_interval = 1024
minibatch = 16
policy_learning_rate = 1e-2
policy_arch=[128, 128, dict(vf=[32, 32], pi=[32, 32])]
```
%% Cell type:code id: tags:
``` python
# A constructor that generates components for a single trial
# based on global variables.
def get_components():
    env = DummyVecEnv([lambda: TanksPhysicalEnv(tanks, tstep=tstep)])
    model = deepcopy(grid.best_estimator_)
    agent = PPO2(MlpPolicy, env, verbose=0, n_steps=update_interval,
                 nminibatches=minibatch, learning_rate=policy_learning_rate,
                 policy_kwargs=dict(net_arch=policy_arch))
    return env, model, agent
# A single trial which runs for periods*online_interval steps
# given a pair of degradation factors.
def trial(tfactor, efactor, periods, agent, env, model, offline=True, return_caches=False):
    if return_caches:
        cache = []
    rewards_rl, rewards_open, rewards_closed = \
        np.zeros(periods), np.zeros(periods), np.zeros(periods)
    for period in trange(periods):
        degrade(tanks, period, tfactor, efactor, **nominal_config)
        # Online learning + control
        agent.set_env(env)
        x_cache, u_cache, d_cache, r_cache = [], [], [], []
        agent.learn(total_timesteps=online_interval, seed=seed,
                    callback=cache_function(x_cache, u_cache, d_cache, r_cache))
        # Accumulate rewards per period
        rewards_rl[period] = np.sum(r_cache) / online_interval
        rewards_open[period] = \
            rewards_from_actions(env.envs[0], np.ones((online_interval, n_tanks))) \
            / online_interval
        rewards_closed[period] = \
            rewards_from_actions(env.envs[0], np.zeros((online_interval, n_tanks))) \
            / online_interval
        if return_caches:
            cache.append((x_cache, u_cache, d_cache, r_cache))
        # Offline learning
        if offline:
            # Learn model
            xu, x_next = cache_to_training_set(x_cache, u_cache, d_cache)
            model.fit(xu, x_next)
            # Set data-driven environment
            agent.set_env(DummyVecEnv([lambda: TanksDataEnv(tanks, model, tstep)]))
            # Learn optimal control
            agent.learn(total_timesteps=offline_interval, seed=seed)
    if return_caches:
        return cache
    return rewards_rl, rewards_open, rewards_closed
# Run a trial multiple times where degradation factors are chosen randomly from
# a range. The number of degrading tanks/engines is also random (at most
# atmost_tanks / atmost_engines).
def multiple_trials(n_trials, periods=10, tfactors=(10, 20), efactors=(10, 20),
                    atmost_tanks=2, atmost_engines=2, offline=True):
    np.random.seed(seed)
    random = np.random.RandomState(seed)
    r_rl, r_open, r_closed = [], [], []
    for _ in trange(n_trials):
        env, model, agent = get_components()
        tfactor = np.ones(n_tanks) * np.inf
        if atmost_tanks > 0:
            tanks_affected = random.randint(1, atmost_tanks + 1)
            idx_affected = random.choice(n_tanks, size=tanks_affected, replace=False)
            tfactor[idx_affected] = random.randint(*tfactors, size=tanks_affected)
        efactor = np.ones(n_engines) * np.inf
        if atmost_engines > 0:
            engines_affected = random.randint(1, atmost_engines + 1)
            idx_affected = random.choice(n_engines, size=engines_affected, replace=False)
            efactor[idx_affected] = random.randint(*efactors, size=engines_affected)
        rewards_rl, rewards_open, rewards_closed = trial(tfactor, efactor, periods,
                                                         agent, env, model, offline)
        r_rl.append(rewards_rl)
        r_open.append(rewards_open)
        r_closed.append(rewards_closed)
    mean_rl = np.mean(r_rl, axis=0)
    mean_open = np.mean(r_open, axis=0)
    mean_closed = np.mean(r_closed, axis=0)
    std_rl = np.std(r_rl, axis=0)
    std_open = np.std(r_open, axis=0)
    std_closed = np.std(r_closed, axis=0)
    return mean_rl, mean_open, mean_closed, \
        std_rl, std_open, std_closed
```
%% Cell type:code id: tags:
``` python
offline_res = {True: (None, None, None), False: (None, None, None)}
```
%% Cell type:code id: tags:
``` python
# Plot single trial rewards
periods = 10
tfactor = [np.inf, np.inf, np.inf, np.inf, 20, np.inf]
efactor = [20., np.inf]
offline = False
env, model, agent = get_components()
offline_res[offline] = trial(tfactor, efactor, periods,
agent, env, model, offline)
```
%% Cell type:code id: tags:
``` python
plt.figure(figsize=(10, 8))
if offline_res[True][1] is not None:
    rewards_open, rewards_closed = offline_res[True][1], offline_res[True][2]
else:
    rewards_open, rewards_closed = offline_res[False][1], offline_res[False][2]
rewards_rl_offline, rewards_rl_online = offline_res[True][0], offline_res[False][0]
plt.plot(rewards_open, '--', label='Open')
plt.plot(rewards_closed, ':', label='Closed')
if rewards_rl_offline is not None:
    plt.plot(rewards_rl_offline, 'g-', label='RL, Offline=True')
if rewards_rl_online is not None:
    plt.plot(rewards_rl_online, 'g-.', label='RL, Offline=False')
plt.legend()
plt.grid(True, which='both')
plt.ylabel('Rewards per step')
plt.xlabel('Time /intervals')
plt.title('Accumulated rewards over degradation')
plt.show()
```
%% Cell type:code id: tags:
``` python
degrade(tanks, periods, tfactor, efactor, **nominal_config)
plot_tanks(env.envs[0], agent)
plt.suptitle('Control under degradation at interval {}, offline={}'.format(periods, offline));
```
%% Cell type:code id: tags:
``` python
offline_res_agg = {True: (None,)*6, False: (None,)*6}
```
%% Cell type:code id: tags:
``` python
# Plot multiple trial rewards
periods = 10
tfactors = (10, 30)
efactors = (10, 30)
atmost_tanks = 1
atmost_engines = 1
offline = True
offline_res_agg[offline] = \
    multiple_trials(20, periods, tfactors, efactors, atmost_tanks, atmost_engines, offline)
```
%% Cell type:code id: tags:
``` python
plt.figure(figsize=(10, 8))
rewards_open, rewards_closed = offline_res_agg[offline][1], offline_res_agg[offline][2]
std_open, std_closed = offline_res_agg[offline][4], offline_res_agg[offline][5]
rewards_rl_offline, rewards_rl_online = offline_res_agg[True][0], offline_res_agg[False][0]
std_rl_offline, std_rl_online = offline_res_agg[True][3], offline_res_agg[False][3]
plt.plot(rewards_open, '--', label='Open')
plt.plot(rewards_closed, ':', label='Closed')
if rewards_rl_offline is not None:
    plt.plot(rewards_rl_offline, 'g-', label='RL, Offline=True')
    plt.fill_between(np.arange(periods), rewards_rl_offline + std_rl_offline,
                     rewards_rl_offline - std_rl_offline,
                     alpha=0.3, color='g')
if rewards_rl_online is not None:
    plt.plot(rewards_rl_online, 'g-.', label='RL, Offline=False')
    plt.fill_between(np.arange(periods), rewards_rl_online + std_rl_online,
                     rewards_rl_online - std_rl_online,
                     alpha=0.3, color='g')
plt.gca().set_prop_cycle(None)
plt.fill_between(np.arange(periods), rewards_open + std_open, rewards_open - std_open, alpha=0.3)
plt.fill_between(np.arange(periods), rewards_closed + std_closed, rewards_closed - std_closed, alpha=0.3)
plt.legend()
plt.grid(True, which='both')
plt.ylabel('Rewards per step')
plt.xlabel('Time /intervals')
plt.title('Average accumulated rewards over degradation')
plt.show()
```
# Airplane Fault Tolerance
Using reinforcement learning to control hybrid systems under degradation. This repository contains code for the following papers:
1. Ahmed, I., Quiñones-Grueiro, M. and Biswas, G. (2020). Fault-Tolerant Control of Degrading Systems with On-Policy Reinforcement Learning. IFAC-PapersOnLine (under review).
## Project structure
* `IFAC Congress 2020`: Jupyter notebook containing code and results for the experiments in the IFAC 2020 paper.
* `tanks.py`: Definitions of the fuel tanks model and OpenAI `gym` environment classes for use in reinforcement learning (see the sketch after this list).
* `utils.py`: Functions used in the notebook for data transformation; not relevant to the theory.
* `plotting.py`: Functions for plotting graphs.
* `environment.yml`: Anaconda environment file for running the notebook.
* `dev.yml`: Environment file used to write this code. It does not install the additional libraries I authored; instead, their locations on disk are specified manually.
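
A minimal sketch of how these pieces fit together, following the calls made in the notebook (the notebook passes an explicit nominal configuration to `TanksFactory`; constructor arguments beyond `n`, `e`, and `tstep` are omitted here for brevity):

```python
import numpy as np
from tanks import TanksFactory, TanksPhysicalEnv
from plotting import plot_tanks

# Build the fuel-tank system and wrap it in a gym-style environment.
tanks = TanksFactory(n=6, e=2)            # 6 tanks feeding 2 engines
env = TanksPhysicalEnv(tanks, tstep=1.0)

# Roll out one episode with all valves closed (action = zeros).
obs, done = env.reset(), False
while not done:
    obs, reward, done, info = env.step(np.zeros(6))

plot_tanks(env, plot='closed')            # plot tank levels for the closed-valve policy
```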
## Usage
This repository depends on [Anaconda](https://docs.conda.io/en/latest/miniconda.html) to manage dependencies.
1. Install dependencies
```
conda env create -f environment.yml
```
2. Activate environment
```
conda activate ifac
```
3. Run notebooks
```
jupyter notebook
```
name: rl
channels:
- conda-forge
dependencies:
- python=3.7
- scikit-learn
- scipy
- matplotlib
- numpy<1.17
- anaconda::tensorflow=1.14
- pytorch::pytorch=1.2
- pytorch::cudatoolkit=10.0
- jupyter
- pandas
- tqdm
- pylint
- pip
- pip:
  - https://git.isis.vanderbilt.edu/ahmedi/pyTorchBridge/-/archive/master/pyTorchBridge-master.tar.gz
  - https://git.isis.vanderbilt.edu/ahmedi/pyStateSpace/-/archive/master/pyStateSpace-master.tar.gz
  - gym
  - gast==0.2.2
  - stable-baselines[mpi]==2.8
"""
Plotting functions for jupyter notebook.
"""
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
def plot_tanks(env, agent=None, plot='both'):
    n_tanks = len(env.tanks.heights)
    if agent is not None:
        x, u, done = [], [], False
        x.append(env.reset())
        while not done:
            u_, _ = agent.predict(x[-1])
            x_, _, done, _ = env.step(u_)
            x.append(x_)
            u.append(u_)
        x, u = np.asarray(x), np.asarray(u)
        opened = u == 1
        episode_length = len(x)
        episode_duration = episode_length * env.tstep
    else:
        episode_duration = sum(env.tanks.heights * env.tanks.cross_section) \
            / min(sum(env.tanks.pumps), sum(env.tanks.engines))
        episode_length = int(episode_duration / env.tstep)
    if plot in ('closed', 'both'):
        u_closed = np.zeros((episode_length, n_tanks))
        x_closed = np.zeros_like(u_closed)
        env.reset()
        for i in range(len(u_closed)):
            x_closed[i] = env.step(u_closed[i])[0]
    if plot in ('open', 'both'):
        u_open = np.ones((episode_length, n_tanks))
        x_open = np.zeros_like(u_open)
        env.reset()
        for i in range(len(u_open)):
            x_open[i] = env.step(u_open[i])[0]
    plt.figure(figsize=(12, 12))
    patches = None
    for i in range(n_tanks):
        plt.subplot(n_tanks // 2, 2, i + 1)
        plt.ylim(0, 1.05 * max(env.tanks.heights))
        if plot in ('open', 'both'):
            plt.plot(x_open[:, i], '--', label='Open' if i == n_tanks - 1 else None)
        if plot in ('closed', 'both'):
            plt.plot(x_closed[:, i], ':', label='Closed' if i == n_tanks - 1 else None)
        if agent is not None:
            cmap = plt.cm.gray
            im = plt.imshow(opened[:, i].reshape(1, -1), aspect='auto', alpha=0.3,
                            extent=(*plt.xlim(), *plt.ylim()), origin='lower',
                            vmin=0, vmax=1, cmap=cmap)
            if len(np.unique(opened)) == 2:
                colors = [im.cmap(im.norm(value)) for value in (0, 1)]
                patches = [mpatches.Patch(color=colors[0], label="Closed", alpha=0.3),
                           mpatches.Patch(color=colors[1], label="Opened", alpha=0.3)]
            plt.plot(x[:, i], '-', label='RL' if i == n_tanks - 1 else None)
        plt.ylabel('Tank ' + str(i + 1))
        if i >= 4: plt.xlabel('Time /s')
        if (i == n_tanks - 2) and patches is not None: plt.legend(handles=patches)
        if i == n_tanks - 1: plt.legend()
        plt.grid(True)
"""
Defines the model for a fuel tank system for an airplane.
"""
from typing import Tuple
import numpy as np
from sklearn.base import BaseEstimator
import gym
from pystatespace import Trapezoid, SS_ODE