"""
Data Mining Assignment - Group 5
"""
import os
import json
import joblib
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

from warnings import filterwarnings
filterwarnings('ignore')
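

# DataProcessor discovers the CSV datasets and turns a single-column price
# series into the fixed-length windows the LSTM consumes.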
class DataProcessor:
    def __init__(self, datasets_path):
        self.datasets_path = datasets_path
        self.datasets = self._get_datasets()

    def _get_datasets(self):
        return sorted([
            item for item in os.listdir(self.datasets_path)
            if os.path.isfile(os.path.join(self.datasets_path, item)) and item.endswith('.csv')
        ])
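
    # Sliding-window framing: each sample is `sequence_length` consecutive
    # values and its label is the value that immediately follows. For example,
    # with sequence_length=3 the series [1, 2, 3, 4, 5] yields the pairs
    # ([1, 2, 3] -> 4) and ([2, 3, 4] -> 5).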
    @staticmethod
    def create_sequences(df, sequence_length):
        labels, sequences = [], []
        for i in range(len(df) - sequence_length):
            seq = df.iloc[i:i + sequence_length].values
            label = df.iloc[i + sequence_length].values[0]
            sequences.append(seq)
            labels.append(label)
        return np.array(sequences), np.array(labels)

    @staticmethod
    def preprocess_data(dataframe):
        # Impute missing values: column mode for object columns, mean for
        # numeric ones. Plain assignment avoids the deprecated
        # fillna(inplace=True) chained-assignment pattern.
        for col in dataframe.columns:
            if dataframe[col].isnull().any():
                if dataframe[col].dtype == 'object':
                    dataframe[col] = dataframe[col].fillna(dataframe[col].mode()[0])
                else:
                    dataframe[col] = dataframe[col].fillna(dataframe[col].mean())
        return dataframe
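
    # scale_data fits a fresh scaler on the 'Close' column and overwrites the
    # column with the scaled values; the fitted scaler is returned so
    # predictions can later be mapped back to price units.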
    @staticmethod
    def scale_data(dataframe, scaler_cls):
        scaler = scaler_cls()
        dataframe['Close'] = scaler.fit_transform(dataframe[['Close']])
        return scaler, dataframe
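

# ModelBuilder: input_shape is (sequence_length, n_features); with only the
# 'Close' column that is (60, 1).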
class ModelBuilder:
    @staticmethod
    def build_model(input_shape):
        # Stacked LSTM regressor. The interleaved Dense layers act per time
        # step while return_sequences=True; the last LSTM collapses the
        # sequence to a single vector.
        model = Sequential([
            LSTM(units=128, return_sequences=True, input_shape=input_shape),
            Dropout(0.2),
            Dense(128, activation='relu'),
            LSTM(units=64, return_sequences=True),
            Dropout(0.2),
            Dense(64, activation='relu'),
            LSTM(units=64, return_sequences=True),
            Dropout(0.2),
            Dense(64, activation='relu'),
            LSTM(units=64, return_sequences=False),
            Dropout(0.2),
            Dense(64, activation='relu'),
            # ReLU rather than softmax: a softmax here would force the 32
            # activations to sum to 1, discarding scale information right
            # before the linear regression output trained with MSE.
            Dense(32, activation='relu'),
            Dense(units=1)
        ])
        model.compile(optimizer='adam', loss='mean_squared_error')
        return model
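

# Trainer wraps model.fit with early stopping and checkpointing: training
# stops after 5 epochs without val_loss improvement, and only the best model
# seen so far is written to disk.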
class Trainer:
    def __init__(self, model, model_file, sequence_length, epochs, batch_size):
        self.model = model
        self.model_file = model_file
        self.sequence_length = sequence_length
        self.epochs = epochs
        self.batch_size = batch_size

    def train(self, X_train, y_train, X_test, y_test):
        early_stopping = EarlyStopping(monitor='val_loss', patience=5, mode='min')
        model_checkpoint = ModelCheckpoint(
            filepath=self.model_file,
            save_best_only=True,
            monitor='val_loss',
            mode='min'
        )
        history = self.model.fit(
            X_train, y_train,
            epochs=self.epochs,
            batch_size=self.batch_size,
            validation_data=(X_test, y_test),
            callbacks=[early_stopping, model_checkpoint]
        )
        return history
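

# Post-training helpers. Note that scikit-learn's inverse_transform expects a
# 2-D array, so a 1-D prediction vector should be reshaped first, e.g.
#   PostProcessor.inverse_transform(minmax_scaler, preds.reshape(-1, 1))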
class PostProcessor:
    @staticmethod
    def inverse_transform(scaler, data):
        return scaler.inverse_transform(data)

    @staticmethod
    def save_json(filename, data):
        with open(filename, 'w') as f:
            json.dump(data, f)
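

# Per-dataset pipeline: load the 'Close' series, standardize, min-max scale,
# window into sequences, train, then persist the model, both scalers, and the
# scaled series as JSON.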
def main():
    datasets_path = './datasets'
    models_path = './models'
    posttrained = './posttrained'
    pickle_file = './pickles'

    sequence_length = 60
    epochs = 200
    batch_size = 32

    # Ensure the output directories exist before training writes to them.
    for path in (models_path, posttrained, pickle_file):
        os.makedirs(path, exist_ok=True)

    data_processor = DataProcessor(datasets_path)

    for dataset in data_processor.datasets:
        print(f"[TRAINING] {dataset.replace('.csv', '')}")

        dataframe = pd.read_csv(os.path.join(datasets_path, dataset), index_col='Date')[['Close']]
        model_file = os.path.join(models_path, f"{dataset.replace('.csv', '')}.keras")

        # dataframe = data_processor.preprocess_data(dataframe)
        dataframe.dropna(inplace=True)

        # Two-stage scaling: standardize first, then squash to [0, 1]. Both
        # fitted scalers are saved below so the chain can be inverted later.
        standard_scaler, dataframe = data_processor.scale_data(dataframe, StandardScaler)
        minmax_scaler, dataframe = data_processor.scale_data(dataframe, MinMaxScaler)

        sequences, labels = data_processor.create_sequences(dataframe, sequence_length)
        input_shape = (sequences.shape[1], sequences.shape[2])
        model = ModelBuilder.build_model(input_shape)

        # Chronological 80/20 split; no shuffling, since order matters for
        # time series.
        train_size = int(len(sequences) * 0.8)
        X_train, X_test = sequences[:train_size], sequences[train_size:]
        y_train, y_test = labels[:train_size], labels[train_size:]

        trainer = Trainer(model, model_file, sequence_length, epochs, batch_size)
        trainer.train(X_train, y_train, X_test, y_test)
        dataframe_json = {'Date': dataframe.index.tolist(), 'Close': dataframe['Close'].tolist()}
        PostProcessor.save_json(
            os.path.join(posttrained, f'{dataset.replace(".csv", "")}-posttrained.json'),
            dataframe_json
        )

        # Persist both fitted scalers; inference must reapply them in the same
        # order and invert them in reverse order.
        joblib.dump(minmax_scaler, os.path.join(pickle_file, f'{dataset.replace(".csv", "")}_minmax_scaler.pickle'))
        joblib.dump(standard_scaler, os.path.join(pickle_file, f'{dataset.replace(".csv", "")}_standard_scaler.pickle'))

        # Reload the best checkpointed weights so the final saved model
        # reflects the best validation epoch rather than the last one.
        model.load_weights(model_file)
        model.save(model_file)

        print("\n\n")
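

# A minimal inference sketch (hypothetical; nothing in this script calls it).
# It assumes main() above has produced '<name>.keras' plus the two scaler
# pickles; the function name and its `recent_closes` parameter are
# illustrative, not part of the original pipeline.
def predict_next_close(name, recent_closes, sequence_length=60):
    from tensorflow.keras.models import load_model

    model = load_model(os.path.join('./models', f'{name}.keras'))
    standard_scaler = joblib.load(os.path.join('./pickles', f'{name}_standard_scaler.pickle'))
    minmax_scaler = joblib.load(os.path.join('./pickles', f'{name}_minmax_scaler.pickle'))

    # Re-apply the training-time scaling chain: standardize, then min-max.
    window = np.asarray(recent_closes[-sequence_length:], dtype=float).reshape(-1, 1)
    window = minmax_scaler.transform(standard_scaler.transform(window))

    scaled_pred = model.predict(window.reshape(1, sequence_length, 1), verbose=0)

    # Invert the chain in reverse order to recover a price in original units.
    return float(standard_scaler.inverse_transform(
        minmax_scaler.inverse_transform(scaled_pred.reshape(-1, 1))
    )[0, 0])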


if __name__ == "__main__":
    main()