1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
|
- import numpy as np
- import pandas as pd
- import os
- from collections import UserDict
def load_data(data_dir):
    """Load the GEFCom 2014 energy load data.

    Reads ``energy.csv`` from *data_dir*, parsing its ``timestamp`` column as
    datetimes, and returns the data indexed by a continuous hourly range.

    Parameters
    ----------
    data_dir : str
        Directory that contains ``energy.csv``.

    Returns
    -------
    pandas.DataFrame
        The CSV contents without the ``timestamp`` column, indexed by every
        hour between the smallest and largest timestamp in the file. Hours
        that are absent from the file appear as all-NaN rows.
    """
    energy = pd.read_csv(os.path.join(data_dir, 'energy.csv'), parse_dates=['timestamp'])

    # Reindex the dataframe such that it has a record for every time point
    # between the minimum and maximum timestamp in the time series. This helps
    # to identify missing time periods in the data.
    energy.index = energy['timestamp']
    # An explicit Hour offset is used instead of the 'H' string alias, which
    # newer pandas releases deprecate; the offset object is accepted by both
    # old and new versions.
    full_range = pd.date_range(energy['timestamp'].min(),
                               energy['timestamp'].max(),
                               freq=pd.offsets.Hour())
    energy = energy.reindex(full_range)
    energy = energy.drop('timestamp', axis=1)
    return energy
def mape(predictions, actuals):
    """Mean absolute percentage error.

    Computes ``mean(|predictions - actuals| / |actuals|)``.

    Parameters
    ----------
    predictions, actuals
        Objects that support element-wise subtraction and division and whose
        result has a ``.mean()`` method, e.g. pandas Series/DataFrames or
        numpy arrays.

    Returns
    -------
    The mean of the element-wise absolute percentage errors, as produced by
    the ``.mean()`` of the input type (a scalar for a Series or ndarray).
    """
    # np.abs (rather than the pandas-only .abs() method) lets numpy arrays be
    # passed as well. The denominator is also taken in absolute value so that
    # negative actuals cannot yield negative "absolute" percentage errors.
    return (np.abs(predictions - actuals) / np.abs(actuals)).mean()
def create_evaluation_df(predictions, test_inputs, H, scaler):
    """Create a long-format data frame for easy evaluation.

    Parameters
    ----------
    predictions
        2-D data with one column per forecast step (``H`` columns), as
        accepted by the ``pandas.DataFrame`` constructor.
    test_inputs
        Object exposing ``.dataframe.index`` (used as the timestamps) and
        ``['target']`` (the actual values, transposed and flattened to line
        up with the melted predictions).
    H : int
        Forecast horizon; columns are labelled ``t+1`` .. ``t+H``.
    scaler
        Object with an ``inverse_transform`` method, applied to the
        ``prediction`` and ``actual`` columns together.

    Returns
    -------
    pandas.DataFrame
        Columns ``timestamp``, ``h``, ``prediction`` and ``actual``.
    """
    horizon_labels = []
    for step in range(1, H + 1):
        horizon_labels.append('t+' + str(step))

    # Wide layout: one row per forecast creation time, one column per step.
    wide = pd.DataFrame(predictions, columns=horizon_labels)
    wide['timestamp'] = test_inputs.dataframe.index

    # Long layout: one row per (timestamp, horizon step) pair.
    long_df = pd.melt(wide, id_vars='timestamp', value_name='prediction', var_name='h')

    # melt stacks the horizon columns one after another, so the actuals are
    # transposed before flattening to follow the same ordering.
    actuals = np.transpose(test_inputs['target'])
    long_df['actual'] = actuals.ravel()

    value_cols = ['prediction', 'actual']
    long_df[value_cols] = scaler.inverse_transform(long_df[value_cols])
    return long_df
class TimeSeriesTensor(UserDict):
    """A dictionary of tensors for input into the RNN model.

    Use this class to:
      1. Shift the values of the time series to create a Pandas dataframe
         containing all the data for a single training example
      2. Discard any samples with missing values
      3. Transform this Pandas dataframe into a numpy array of shape
         (samples, time steps, features) for input into Keras

    The class takes the following parameters:

    - **dataset**: original time series (a Pandas dataframe)
    - **target**: name of the target column
    - **H**: the forecast horizon
    - **tensor_structure**: a dictionary describing the tensor structure of
      the form
          { 'tensor_name' : (range(max_backward_shift, max_forward_shift), [feature, feature, ...] ) }
      if features are non-sequential and should not be shifted, use the form
          { 'tensor_name' : (None, [feature, feature, ...])}
    - **freq**: time series frequency passed to ``shift`` (default 'H' - hourly)
    - **drop_incomplete**: (Boolean) whether to drop incomplete samples
      (default True)

    After construction the shifted dataframe is available as ``.dataframe``
    and the arrays are accessible by tensor name, e.g. ``obj['target']``.
    """

    def __init__(self, dataset, target, H, tensor_structure, freq='H', drop_incomplete=True):
        super().__init__()
        self.dataset = dataset
        self.target = target
        self.tensor_structure = tensor_structure
        self.tensor_names = list(tensor_structure.keys())

        self.dataframe = self._shift_data(H, freq, drop_incomplete)
        # UserDict serves item access from self.data.
        self.data = self._df2tensors(self.dataframe)

    def _shift_data(self, H, freq, drop_incomplete):
        """Build the shifted dataframe with (tensor, feature, time step) columns.

        Use the tensor_structure definitions to shift the features in the
        original dataset. The result is a Pandas dataframe with multi-index
        columns in the hierarchy
            tensor - the name of the input tensor
            feature - the input feature to be shifted
            time step - the time step for the RNN in which the data is input.
                        These labels are centred on time t, the forecast
                        creation time
        """
        # The 'H' string alias is deprecated by newer pandas releases; the
        # equivalent offset object is accepted by old and new versions alike,
        # so the default keeps working.
        if isinstance(freq, str) and freq == 'H':
            freq = pd.offsets.Hour()

        df = self.dataset.copy()

        idx_tuples = []
        # Future values of the target: column 't+k' holds the target k steps ahead.
        for t in range(1, H+1):
            df['t+'+str(t)] = df[self.target].shift(t*-1, freq=freq)
            idx_tuples.append(('target', 'y', 't+'+str(t)))

        for name, structure in self.tensor_structure.items():
            rng = structure[0]
            dataset_cols = structure[1]

            for col in dataset_cols:

                # do not shift non-sequential 'static' features
                if rng is None:
                    # Prefix with the tensor name so that two static tensors
                    # sharing a feature get distinct temporary columns.
                    df[name+'_'+col+'_static'] = df[col]
                    idx_tuples.append((name, col, 'static'))
                else:
                    for t in rng:
                        # Labels look like 't-2', 't-1', 't', 't+1', ...
                        sign = '+' if t > 0 else ''
                        shift = str(t) if t != 0 else ''
                        period = 't'+sign+shift
                        shifted_col = name+'_'+col+'_'+period
                        df[shifted_col] = df[col].shift(t*-1, freq=freq)
                        idx_tuples.append((name, col, period))

        # Keep only the derived columns; they were appended in the same order
        # as idx_tuples, so the multi-index labels line up positionally.
        df = df.drop(self.dataset.columns, axis=1)
        idx = pd.MultiIndex.from_tuples(idx_tuples, names=['tensor', 'feature', 'time step'])
        df.columns = idx

        if drop_incomplete:
            df = df.dropna(how='any')

        return df

    def _df2tensors(self, dataframe):
        """Convert the shifted dataframe into a dict of numpy arrays.

        Transform the shifted Pandas dataframe into the multidimensional numpy
        arrays. These arrays can be used to input into the keras model and can
        be accessed by tensor name. For example, for a TimeSeriesTensor object
        named "model_inputs" and a tensor named "target", the input tensor can
        be accessed with model_inputs['target'].
        """
        inputs = {}
        # .values replaces DataFrame.as_matrix(), which no longer exists in
        # current pandas; .values is available in old and new versions.
        inputs['target'] = dataframe['target'].values

        for name, structure in self.tensor_structure.items():
            rng = structure[0]
            cols = structure[1]
            tensor = dataframe[name][cols].values
            if rng is None:
                # Static features: (samples, features)
                tensor = tensor.reshape(tensor.shape[0], len(cols))
            else:
                # Columns are grouped by feature, then time step; reshape to
                # (samples, features, time steps) and swap the last two axes
                # to obtain (samples, time steps, features).
                tensor = tensor.reshape(tensor.shape[0], len(cols), len(rng))
                tensor = np.transpose(tensor, axes=[0, 2, 1])
            inputs[name] = tensor

        return inputs

    def subset_data(self, new_dataframe):
        """Replace the shifted dataframe and rebuild the tensors from it.

        Use this function to recreate the input tensors if the shifted
        dataframe has been filtered.
        """
        self.dataframe = new_dataframe
        self.data = self._df2tensors(self.dataframe)
|