saadmahboob / simplecryptoRNN

An LSTM RNN with some preprocessing on a few cryptocurrencies


simplecryptoRNN

Modified some things.

Mostly from: https://www.youtube.com/watch?v=ne-dpRdNReI

import pandas as pd
import os
from sklearn import preprocessing  # may emit a DeprecationWarning about imp via cloudpickle
import numpy as np
from collections import deque
import random
import time
import tensorflow as tf
from tensorflow.keras.models import Sequential
# To use CuDNNLSTM: swap it in for the LSTMs below and remove their activation arguments
from tensorflow.keras.layers import Dense, Dropout, LSTM, CuDNNLSTM, BatchNormalization
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint
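# NOTE: these imports target TF 1.x-era Keras. On TF 2.x the CuDNNLSTM import fails;
# there, the plain LSTM layer uses the cuDNN kernel automatically when its config allows.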

# CONSTANTS
SEQ_LEN = 60  # last 60 minutes of pricing data
FUTURE_PERIOD_PREDICT = 3  # want to predict out 3 minutes
RATIO_TO_PREDICT = 'LTC-USD'  # which ratio we want to predict on
EPOCHS = 10
BATCH_SIZE = 64
NAME = f"{RATIO_TO_PREDICT}-{SEQ_LEN}-SEQ-{FUTURE_PERIOD_PREDICT}-PRED-{int(time.time())}"
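# NAME tags each run for the TensorBoard logs, e.g. "LTC-USD-60-SEQ-3-PRED-1558470000"
# (the trailing number is the launch timestamp; the value shown is hypothetical)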

def classify(current, future):
    if float(future) > float(current):
        return 1  # should buy
    else:
        return 0
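# e.g. classify(100.0, 101.5) -> 1 (future price higher, buy); classify(100.0, 99.0) -> 0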


def scaling(df):
    df = df.drop(columns='future')  # don't need this column anymore, don't train on it

    # scale columns by percent change
    for col in df.columns:
        # don't scale target column
        if col != 'target':

            df[col] = df[col].pct_change()  # normalizing by the pct change
            df.dropna(inplace=True)
            df[col] = preprocessing.scale(df[col].values)  # standardize to zero mean, unit variance
    df.dropna(inplace=True)  # catch any na caused by scaling

    return df
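
# Illustration (hypothetical values): pct_change turns raw prices into step-over-step
# returns, e.g. pd.Series([100.0, 101.0, 99.0]).pct_change() -> [NaN, 0.01, -0.0198],
# and preprocessing.scale then standardizes each column of returns.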


# unused alternative: an array-based sliding window (the deque in preprocess_df is used instead)
# def sliding_window(dataset, seq_len, stride):
#     n_samples, d = dataset.shape
#     data_slide = np.zeros((int((n_samples - seq_len) / stride) + 1, seq_len, d))
#     k = 0

#     for i in range(0, n_samples - seq_len, stride):
#         data_slide[k, :, :] = dataset[i:i + seq_len, :]
#         k = k + 1

#     return data_slide  # was missing a return

def preprocess_df(df):

    # the sliding window for sequential data
    prev_days = deque(maxlen=SEQ_LEN)

    # where the sliding windows are appended
    sequential_data = []

    # .values returns a numpy representation of the DataFrame
    for i in df.values:
        prev_days.append(i[:-1])  # leave target column out of prev_day data

        # once the deque is full, we have one complete SEQ_LEN-minute window
        if len(prev_days) == SEQ_LEN:
            sequential_data.append([np.array(prev_days), i[-1]])  # i[-1] is the target column label = y

    random.shuffle(sequential_data)

    # balance the data
    buys = []
    sells = []

    for seq, target in sequential_data:
        if target == 0:
            sells.append([seq, target])
        elif target == 1:
            buys.append([seq, target])

    # cap both classes at the size of the smaller one
    lower = min(len(buys), len(sells))

    # rebalance to that minimum length
    buys = buys[:lower]
    sells = sells[:lower]

    sequential_data = buys + sells

    # shuffle again so buys and sells aren't grouped together
    random.shuffle(sequential_data)

    X = []
    y = []

    for seq, target in sequential_data:
        X.append(seq)
        y.append(target)

    return np.array(X), np.array(y)  # y as an array so Keras accepts it directly
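
# Shape sketch (assuming all four ratios load, i.e. 8 feature columns):
#   X -> (n_sequences, SEQ_LEN, 8); y -> (n_sequences,)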


# crypto_data should be in same folder
dir_path = os.path.dirname(os.path.realpath(__file__))
os.chdir(dir_path)


# read in the data
main_df = pd.DataFrame()
ratios = ['BTC-USD', 'LTC-USD', 'ETH-USD', 'BCH-USD']

for ratio in ratios:
    dataset = f"crypto_data/{ratio}.csv"

    df = pd.read_csv(dataset, names=['time', 'low', 'high', 'open', 'close', 'volume'])
    df.rename(columns={'close': f'{ratio}_close', 'volume': f'{ratio}_volume'}, inplace=True)

    df.set_index('time', inplace=True)
    df = df[[f"{ratio}_close", f"{ratio}_volume"]]

    if len(main_df) == 0:
        main_df = df
    else:
        main_df = main_df.join(df)
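
# NOTE: join aligns on the shared time index; timestamps present in the first CSV but
# missing from a later one become NaN and are dropped by the dropna calls in scaling()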

# create a future column: the RATIO_TO_PREDICT close price FUTURE_PERIOD_PREDICT minutes ahead
main_df['future'] = main_df[f'{RATIO_TO_PREDICT}_close'].shift(-FUTURE_PERIOD_PREDICT)

# build a target column that is 1 when the future price is higher than the current close
main_df['target'] = list(map(classify, main_df[f'{RATIO_TO_PREDICT}_close'], main_df['future']))
# print(main_df[[f'{RATIO_TO_PREDICT}_close', 'future', 'target']].head(10))
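# Illustrative rows (hypothetical prices, FUTURE_PERIOD_PREDICT = 3):
#   close at t = 96.58, close at t+3 = 96.50 -> future 96.50, target 0
#   close at t = 96.45, close at t+3 = 96.47 -> future 96.47, target 1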

# hold out the last 5% of the data for testing and the 5% before that for validation
times = sorted(main_df.index.values)
last_5pct = int(-len(times) * 0.05)  # negative index where a 5%-of-original slice begins

# the testing data
testing_df = main_df[last_5pct:]
testing_df = scaling(testing_df)

# keep validation and training data together when first scaling
main_df = main_df[:last_5pct]
main_df = scaling(main_df)

# split into train and validation: main_df has already lost its last 5%,
# so its final 5%-of-original slice is exactly the validation window
training_df = main_df[:last_5pct]
validation_df = main_df[last_5pct:]

# preprocess the data
train_x, train_y = preprocess_df(training_df)
validation_x, validation_y = preprocess_df(validation_df)

# some statistics
print(f"train data: {len(train_x)} validation: {len(validation_x)}")
print(f"don't buys: {np.count_nonzero(train_y == 0)}, buys: {np.count_nonzero(train_y == 1)}")
print(f"VALIDATION don't buys: {np.count_nonzero(validation_y == 0)}, buys: {np.count_nonzero(validation_y == 1)}")
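
# the buy and don't-buy counts should match exactly here, confirming the balancing step worked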

model = Sequential()
model.add(LSTM(128, activation='tanh', input_shape=train_x.shape[1:], return_sequences=True))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(LSTM(128, activation='tanh', return_sequences=True))  # input_shape is only needed on the first layer
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(LSTM(128, activation='tanh'))  # last LSTM returns only the final output
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))

model.add(Dense(2, activation="softmax"))
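
# Design note: a 2-way softmax pairs with sparse_categorical_crossentropy below;
# a single sigmoid unit with binary_crossentropy would be the equivalent binary setup.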

opt = tf.keras.optimizers.Adam(learning_rate=0.001, decay=1e-6)

model.compile(loss='sparse_categorical_crossentropy',
              optimizer=opt,
              metrics=['accuracy'])

tensorboard = TensorBoard(log_dir=f"logs/{NAME}")

# Save the Models
filepath = "RNN_Final-{epoch:02d}-{val_acc:.3f}"  # unique file name including the epoch and its validation accuracy
checkpoint = ModelCheckpoint("models/{}.model".format(filepath), monitor='val_acc', verbose=1, save_best_only=True, mode='max')  # kwargs go to ModelCheckpoint, not str.format; saves only the best epochs
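
# ModelCheckpoint may not create the output directory itself, so make sure it exists
os.makedirs("models", exist_ok=True)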

history = model.fit(
    train_x, train_y,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(validation_x, validation_y),
    callbacks=[tensorboard, checkpoint])

# inspect training curves with: tensorboard --logdir=logs
# (run it from the directory that contains the logs folder)
