Author: Leo Peach
Date: 22-12-2018
The initial investigation will follow a similar method to that used in James et al. (2017). Initially we will focus on developing the necessary data for the MLP (multi-layer perceptron) model.
Data:
Wave buoy data from the Queensland Government and hindcast data from CSIRO.
The hindcast data comes as daily grid files, so to build a time series for one location we need to open each file and extract the values at the grid cell nearest that location.
import netCDF4
import numpy as np
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
#wave buoy info
brisbane = {'name': 'Brisbane, Australia', 'lat': -27.54722, 'lon': 153.8667}
gold_coast = {'name': 'Gold Coast, Australia', 'lat': -27.9646, 'lon': 153.4404} # location in metadata incorrect
def extract_hindcast(file, wb_info):
    '''Parse a hindcast netCDF file.
    Returns a DataFrame with the relevant variables at the grid cell
    nearest the given location.'''
    fh = netCDF4.Dataset(file, 'r')
    lons = fh.variables['longitude'][:]
    lats = fh.variables['latitude'][:]
    hs = fh.variables['hs'][:]
    t02 = fh.variables['t02'][:]
    u10 = fh.variables['U10'][:]
    v10 = fh.variables['V10'][:]
    time = fh.variables['time'][:]
    pkdir = fh.variables['dp'][:]
    pkfreq = fh.variables['fp'][:]
    fh.close()
    # nearest-neighbour lookup of the grid cell closest to the location
    latidx = np.abs(lats - wb_info['lat']).argmin()
    lonidx = np.abs(lons - wb_info['lon']).argmin()
    hs = pd.Series(hs[:, latidx, lonidx])
    t02 = pd.Series(t02[:, latidx, lonidx])
    u10 = pd.Series(u10[:, latidx, lonidx])
    v10 = pd.Series(v10[:, latidx, lonidx])
    pkfreq = pd.Series(pkfreq[:, latidx, lonidx])
    pkdir = pd.Series(pkdir[:, latidx, lonidx])
    # each daily file holds 24 hourly records; build the index from the
    # date (YYYY-MM-DD) embedded in the last ten characters of the filename
    tstr = file[-13:-3]
    dateidx = pd.date_range(tstr + " 00:00", tstr + " 23:00", freq='H')
    data = pd.concat([hs, t02, u10, v10, pkfreq, pkdir], axis=1)
    data.index = dateidx
    data.columns = ['hs', 't02', 'u10', 'v10', 'pkfreq', 'pkdir']
    return data
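As a quick check, the function can be run on a single daily file. The filename below is a placeholder, not a real file from the dataset; extract_hindcast only assumes the date (YYYY-MM-DD) occupies the last ten characters before the '.nc' extension.
# Placeholder filename; the date must sit in the last 10 characters before '.nc'.
sample = extract_hindcast('D:/Masters/data/hindcast/ww3_glob.1987-02-21.nc', gold_coast)
print(sample.shape)  # expect (24, 6): one row per hour of the day
sample.head()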
import os
hindcast_directory = 'D:/Masters/data/hindcast/'
gc_hc_list = []
errs = 0
for file in os.listdir(hindcast_directory):
    if file.endswith('.nc'):
        try:
            gc_hc_list.append(extract_hindcast(os.path.join(hindcast_directory, file), gold_coast))
        except Exception:
            # count unreadable files rather than stopping the whole run
            errs += 1
            continue
gc_hc = pd.concat(gc_hc_list)
Save the data at this point so we don't need to run this again for the Gold Coast wave buoy location.
gc_hc.to_csv('gold_coast_hindcast.csv')
Now that the data is in CSV format, read it from disk and use it instead of re-processing the netCDF files.
gc_hc = pd.read_csv('D:/Masters/python/machine_learning/gold_coast_hindcast.csv', index_col=0, parse_dates=True)
gc_hc.head()
The first location is selected offshore of the wave buoy, in approximately 500 m water depth.
loc1_gc = {'name': 'Location1, Australia', 'lat': -27.9646, 'lon': 154.061}
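As a sanity check on the separation between the two points, a minimal haversine (great-circle) distance sketch, assuming a spherical Earth of radius 6371 km:
from math import radians, sin, cos, asin, sqrt

def haversine_km(a, b):
    # great-circle distance in km between two {'lat': ..., 'lon': ...} points
    lat1, lon1, lat2, lon2 = map(radians, (a['lat'], a['lon'], b['lat'], b['lon']))
    h = sin((lat2 - lat1) / 2)**2 + cos(lat1) * cos(lat2) * sin((lon2 - lon1) / 2)**2
    return 2 * 6371.0 * asin(sqrt(h))

print(round(haversine_km(gold_coast, loc1_gc), 1), 'km')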
import os
hindcast_directory = 'D:/Masters/data/hindcast/'
lc_hc1_list = []
errs = 0
for file in os.listdir(hindcast_directory):
    if file.endswith('.nc'):
        try:
            lc_hc1_list.append(extract_hindcast(os.path.join(hindcast_directory, file), loc1_gc))
        except Exception:
            errs += 1
            continue
print(errs)
lc1_hc = pd.concat(lc_hc1_list)
Also save this as a CSV so it can be reloaded directly.
lc1_hc.to_csv('loc1_hindcast.csv')
Now that this is saved as a CSV, read it from disk instead of retrieving the data again.
lc1_hc = pd.read_csv('D:/Masters/python/machine_learning/loc1_hindcast.csv', index_col=0, parse_dates=True)
lc1_hc.head()
fig, ax = plt.subplots(figsize=(14, 6))
ax.plot(lc1_hc.index, lc1_hc.hs, label='Offshore (Location 1)')
ax.plot(gc_hc.index, gc_hc.hs, label='Wave buoy site')
ax.set_ylabel('Hs (m)')
plt.legend()
plt.show()
fig, ax = plt.subplots(figsize=(14, 6))
ax.plot(lc1_hc.index, lc1_hc.t02, label='Offshore (Location 1)')
ax.plot(gc_hc.index, gc_hc.t02, label='Wave buoy site')
ax.set_ylabel('T02 (s)')
plt.legend()
plt.show()
Observation: the data from the two locations is identical. This could be due to a number of reasons; the points are confirmed not to lie in the same grid cell, as they are about 25 km apart.
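One way to test this directly is to compare the nearest-neighbour grid indices the extraction picks for each point. A minimal sketch, assuming any one daily hindcast file is available (the filename below is a placeholder):
# Placeholder filename: any single daily hindcast file will do.
check_file = hindcast_directory + 'some_daily_file.nc'  # hypothetical name
fh = netCDF4.Dataset(check_file, 'r')
lats = fh.variables['latitude'][:]
lons = fh.variables['longitude'][:]
fh.close()
for loc in (gold_coast, loc1_gc):
    latidx = np.abs(lats - loc['lat']).argmin()
    lonidx = np.abs(lons - loc['lon']).argmin()
    print(loc['name'], '->', (int(latidx), int(lonidx)))
# If both points print the same (latidx, lonidx) pair they share a grid cell,
# which would explain identical series regardless of the model's skill.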
Conclusion: although the hindcast dataset is extremely useful, it will need to be downscaled to a higher spatial resolution before it is suitable for building machine learning models.
However, we still need to look at other parts of the data to see how well the model has performed against the wave buoy data in general: are there trends within the data that need to be accounted for?
Plot the data in Bokeh so that it is easier to interrogate.
from bokeh.io import output_notebook, show
from bokeh.plotting import figure
from bokeh.models.tools import HoverTool
output_notebook()
p = figure(x_axis_type="datetime", plot_height=500, plot_width=800, title='Hindcast data comparison')
plotf = p.line(x=gc_hc.index, y=gc_hc.hs, color='blue', legend='Wave Buoy Site')
p.add_tools(HoverTool(renderers=[plotf], tooltips=[("Hs", "@y")], mode='vline'))
plota = p.line(lc1_hc.index, lc1_hc.hs, color='green', legend='Offshore')
p.add_tools(HoverTool(renderers=[plota], tooltips=[("Hs", "@y")], mode='vline'))
p.yaxis.axis_label = "Significant Wave Height (m)"
p.legend.location = "top_left"
p.legend.click_policy = "hide"
show(p)
A comparison of the wave buoy (WB) data we have against the hindcast dataset for the same location.
wb_data_filename = r'D:/Masters/python/machine_learning/gold_coast_wb_cleaned.csv'
wb_data = pd.read_csv(wb_data_filename, index_col = 0, parse_dates = True)
wb_data = wb_data.sort_index()
print(wb_data.index[0], wb_data.index[-1])
hindcast_filename = r'D:/Masters/python/machine_learning/gold_coast_hindcast.csv'
hindcast_data = pd.read_csv(hindcast_filename, index_col = 0, parse_dates = True)
hindcast_data = hindcast_data.loc['1987-02-21 09:00':]
hindcast_data.index[-1]
wb_data = wb_data.loc[:'2010-12-31']
wb_data = wb_data[['Hs','Tz', 'Tp','pkdir']]
hindcast_data = hindcast_data[['hs','t02','pkfreq','pkdir']]
Convert peak frequency to peak period (Tp = 1/fp).
hindcast_data['pkfreq'] = 1 / hindcast_data['pkfreq']
hindcast_data = hindcast_data.rename(columns={"hs": "Hs", "t02": "Tz", "pkfreq": "Tp"})
wb_data.head()
hindcast_data.head()
Now we need to resample the two datasets onto the same hourly time base.
# resample the buoy record to hourly: back-fill at most one step, then interpolate remaining gaps
wb_data_interp = wb_data.resample('1h').bfill(limit=1).interpolate()
hindcast_data.index = pd.to_datetime(hindcast_data.index)
fig, ax = plt.subplots(figsize = (14, 6))
ax.plot(hindcast_data.index, hindcast_data.Hs, label = 'hindcast')
ax.plot(wb_data_interp[1:].index, wb_data_interp[1:].Hs, label = 'wavebuoy')
plt.legend()
plt.show()
from pandas import plotting
hindcast_scatter = plotting.scatter_matrix(hindcast_data, alpha = 0.5, figsize = (14, 14))
Create a combined dataframe for comparison.
wb_data_interp['source'] = 'wave buoy'
hindcast_data['source'] = 'hindcast'
combined = pd.concat([wb_data_interp,hindcast_data])
import seaborn as sns
sns.set_style("white")
colours = ['windows blue', 'greyish']
sns.palplot(sns.xkcd_palette(colours))
pairplt = sns.pairplot(combined, hue="source", diag_kind='hist',
                       plot_kws={'alpha': 0.75, 's': 5, 'edgecolor': 'k', 'linewidth': 0},
                       palette=sns.xkcd_palette(colours))
pairplt.fig.suptitle('Gold Coast Hindcast comparison matrix (1987-2010)')
pairplt.fig.subplots_adjust(top=.9)
plt.savefig('gold_coast_comparison_matrix.png')
In order to measure the similarity between the hindcast model and the recorded data over the period, we will compare the overlapping datasets.
Correlation
wb_data_interp.Hs.corr(hindcast_data.Hs)
wb_data_interp.Tz.corr(hindcast_data.Tz)
wb_data_interp.Tp.corr(hindcast_data.Tp)
wb_data_interp.pkdir.corr(hindcast_data.pkdir)
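The same comparisons can be collected in one short loop; note that pandas .corr() aligns the two series on their timestamps and skips missing pairs automatically:
# Loop over the shared columns and print each correlation coefficient.
for col in ['Hs', 'Tz', 'Tp', 'pkdir']:
    print(col, round(wb_data_interp[col].corr(hindcast_data[col]), 3))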
MSE and MAE
Best value = 0.0 (note that sklearn's mean_squared_error returns the mean squared error, not its square root)
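Unlike pandas, sklearn's metric functions do not align on index and will raise an error on NaNs, so it is safer to align the two frames on their common timestamps and drop incomplete rows first. A minimal sketch, assuming the frames as built above:
# Align on the shared timestamps and keep only rows complete in both frames,
# since sklearn metrics neither index-align nor skip NaNs.
common = wb_data_interp.index.intersection(hindcast_data.index)
wb_aligned = wb_data_interp.loc[common, ['Hs', 'Tz', 'Tp']]
hc_aligned = hindcast_data.loc[common, ['Hs', 'Tz', 'Tp']]
mask = wb_aligned.notna().all(axis=1) & hc_aligned.notna().all(axis=1)
wb_aligned, hc_aligned = wb_aligned[mask], hc_aligned[mask]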
from sklearn.metrics import mean_squared_error, mean_absolute_error, explained_variance_score
mean_squared_error(wb_data_interp.loc[hindcast_data.index].Hs, hindcast_data.Hs)
mean_absolute_error(wb_data_interp.loc[hindcast_data.index].Hs, hindcast_data.Hs)
mean_squared_error(wb_data_interp.loc[hindcast_data.index].Tz, hindcast_data.Tz)
mean_absolute_error(wb_data_interp.loc[hindcast_data.index].Tz, hindcast_data.Tz)
mean_squared_error(wb_data_interp.loc[hindcast_data.index].Tp, hindcast_data.Tp)
mean_absolute_error(wb_data_interp.loc[hindcast_data.index].Tp, hindcast_data.Tp)
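RMSE is simply the square root of MSE, which puts the error back in the units of the variable (metres for Hs, seconds for the periods). Older sklearn versions have no direct RMSE option, so a minimal sketch using the aligned frames from above:
# RMSE per variable, in the variable's own units.
for col in ['Hs', 'Tz', 'Tp']:
    print(col, np.sqrt(mean_squared_error(wb_aligned[col], hc_aligned[col])))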
Explained variance
Best value = 1.0
explained_variance_score(wb_data_interp.loc[hindcast_data.index].Hs, hindcast_data.Hs)
explained_variance_score(wb_data_interp.loc[hindcast_data.index].Tz, hindcast_data.Tz)
explained_variance_score(wb_data_interp.loc[hindcast_data.index].Tp, hindcast_data.Tp)