Cross-Validation Training

# Import necessary packages. 
import pandas as pd
import numpy as np 
import math
import time
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
import copy

Todo:

Examine the distribution of the predicted track and validation track.
Setting up the threshold for calculating the MSE specifically for peaks.
Creating Models for each histone marks.

Data Preparation

# Reading data.
ml_df = pd.read_csv("sources/ML_model/output/ml_data.csv", header=0)

# Create the shuffled dataframe for randomly selecting the folds. 
ml_df_shuf = ml_df.sample(frac=1)

# Define number of folds. 
k = 4

# Create index for the folds. 
folds_index = list(range(0, ml_df_shuf.shape[0], math.ceil(ml_df_shuf.shape[0]/k))) + [ml_df_shuf.shape[0]]
folds_index = [[folds_index[i]+1, folds_index[i+1]] for i in range(len(folds_index)-1)]
folds_index[0][0] = 0
folds_index

[[0, 303615], [303616, 607230], [607231, 910845], [910846, 1214460]]

ml_df

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style>

	avo	cur	cell	mark	ideas	valid
0	0.246995	1.801744e-01	C46	M03	0	0.226440
1	0.248117	1.801744e-01	C46	M03	92	0.226440
2	0.248174	1.801744e-01	C46	M03	0	0.226440
3	0.247927	1.801744e-01	C46	M03	0	0.226440
4	0.248312	1.801744e-01	C46	M03	0	0.226440
5	0.247233	1.801744e-01	C46	M03	19	0.226440
6	0.247316	1.801744e-01	C46	M03	19	0.226440
7	0.247123	1.801744e-01	C46	M03	43	0.226440
8	0.246343	1.801744e-01	C46	M03	19	0.226440
9	0.248208	1.801744e-01	C46	M03	0	0.226440
10	0.248134	1.801744e-01	C46	M03	0	0.226440
11	0.248359	1.801744e-01	C46	M03	0	0.226440
12	0.248478	1.801744e-01	C46	M03	0	0.226440
13	0.248123	1.801744e-01	C46	M03	0	0.226440
14	0.248436	1.801744e-01	C46	M03	0	0.226440
15	0.249592	1.801744e-01	C46	M03	0	0.226440
16	0.248901	1.801744e-01	C46	M03	0	0.226440
17	0.248855	1.801744e-01	C46	M03	0	0.226440
18	0.250269	1.801744e-01	C46	M03	0	0.226440
19	0.248659	1.801744e-01	C46	M03	0	0.226440
20	0.248206	1.801744e-01	C46	M03	2	0.226440
21	0.248728	1.801744e-01	C46	M03	0	0.226440
22	0.249281	1.801744e-01	C46	M03	0	0.226440
23	0.249216	1.801744e-01	C46	M03	0	0.226440
24	0.248470	1.801744e-01	C46	M03	0	0.226440
25	0.248940	1.801744e-01	C46	M03	19	0.226440
26	0.249946	1.801744e-01	C46	M03	0	0.226440
27	0.249419	1.801744e-01	C46	M03	0	0.226440
28	0.249438	1.801744e-01	C46	M03	0	0.226440
29	0.248860	1.801744e-01	C46	M03	15	0.226440
...	...	...	...	...	...	...
1214430	0.182170	2.794365e-01	C23	M25	0	0.134385
1214431	0.142648	2.398810e-01	C23	M25	0	0.000000
1214432	0.089499	7.356942e-02	C23	M25	0	0.000000
1214433	0.085755	7.330097e-02	C23	M25	0	0.000000
1214434	0.088690	5.963657e-02	C23	M25	0	0.000000
1214435	0.074589	5.904327e-02	C23	M25	0	0.000000
1214436	0.071283	5.898153e-02	C23	M25	0	0.000000
1214437	0.068884	5.900870e-02	C23	M25	0	0.000000
1214438	0.062454	6.090621e-02	C23	M25	0	0.000000
1214439	0.066037	5.905338e-02	C23	M25	0	0.000000
1214440	0.081037	5.905134e-02	C23	M25	0	0.000000
1214441	0.079494	5.905811e-02	C23	M25	0	0.000000
1214442	0.079866	5.901047e-02	C23	M25	0	0.000000
1214443	0.080911	5.904380e-02	C23	M25	0	0.000000
1214444	0.078838	5.904843e-02	C23	M25	0	0.000000
1214445	0.064501	5.938006e-02	C23	M25	0	0.000000
1214446	0.060386	3.004827e-03	C23	M25	0	0.000000
1214447	0.058836	6.579292e-04	C23	M25	0	0.000000
1214448	0.057562	5.937061e-04	C23	M25	0	0.000000
1214449	0.058535	1.503559e-03	C23	M25	0	0.000000
1214450	0.060157	3.682951e-07	C23	M25	0	0.000000
1214451	0.062341	0.000000e+00	C23	M25	0	0.000000
1214452	0.061212	0.000000e+00	C23	M25	0	0.000000
1214453	0.061993	0.000000e+00	C23	M25	0	0.000000
1214454	0.061545	0.000000e+00	C23	M25	0	0.000000
1214455	0.063010	0.000000e+00	C23	M25	0	0.000000
1214456	0.060978	0.000000e+00	C23	M25	0	0.000000
1214457	0.061748	0.000000e+00	C23	M25	0	0.000000
1214458	0.061063	0.000000e+00	C23	M25	0	0.000000
1214459	0.061999	0.000000e+00	C23	M25	0	0.000000

1214460 rows × 6 columns

# Creating label and feature dataset. 
label = ml_df_shuf["valid"].to_numpy()
ml_df_shuf.drop(columns=["valid"])
features_dummies = pd.get_dummies(ml_df_shuf)
features = features_dummies.to_numpy() 
print(label[0:5])
print(features[0:5,:])

[0.4194066  0.47912769 0.51961    0.499787   0.176605  ]
[[0.468564   0.48123019 0.         0.4194066  0.         0.
  0.         0.         0.         0.         0.         1.
  0.         0.         0.         1.         0.        ]
 [0.51875847 0.31617134 0.         0.47912769 0.         0.
  0.         0.         0.         0.         1.         0.
  0.         0.         1.         0.         0.        ]
 [0.5656851  0.16588938 0.         0.51961    1.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         1.         0.         0.        ]
 [0.14765898 0.12652433 0.         0.499787   0.         0.
  0.         0.         0.         1.         0.         0.
  0.         0.         1.         0.         0.        ]
 [0.18135708 0.17125739 0.         0.176605   0.         0.
  0.         0.         0.         0.         0.         0.
  0.         1.         0.         0.         1.        ]]

Cross-Validation Training

def time_stamp(): 
    print("[{}]".format(time.time()))

def cv_train(model_name, label, feature, folds): 
    predicted_set = {}
    iter_index = 0

    for validation_set in folds: 
        print("---- Creating Validation Set for data in range {}".format(validation_set))
        if model_name == "RF": 
            regr = RandomForestRegressor()
        else: 
            # Part for adding more models later. 
            print("#### Invalid Model Name. Terminating Training")
            break

        # Creating train and valid features, labels. 
        curr_test_feature = feature[validation_set[0]:validation_set[1],:]
        curr_test_label = label[validation_set[0]:validation_set[1]]
        curr_train_feature = np.delete(feature, np.s_[validation_set[0]:validation_set[1]+1], axis=0)
        curr_train_label = np.delete(label, np.s_[validation_set[0]:validation_set[1]+1], axis=0)

        start_time = time.time()
        print("-------- Start Training on {}".format(start_time))
        regr.fit(curr_train_feature, curr_train_label)
        print("-------- Finished Training, elapsed time: {}".format(time.time() - start_time))
        predicted_set[iter_index] = {
            "model": regr, 
            "predicted": regr.predict(curr_test_feature),
            "test_label": curr_test_label,
            "avocado": curr_test_feature[:,0],
            "curr_impute": curr_test_feature[:,1]
        }
        iter_index += 1

    return predicted_set

def pretty(d, indent=0):
   for key, value in d.items():
      print('\t' * indent + str(key))
      if isinstance(value, dict):
         pretty(value, indent+1)
      else:
         print('\t' * (indent+1) + str(value))

model_dic = cv_train("RF", label, features, folds_index)

---- Creating Validation Set for data in range [0, 303615]
-------- Start Training on 1574182987.977485
/Users/Michavillson/anaconda3/lib/python3.7/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
  "10 in version 0.20 to 100 in 0.22.", FutureWarning)
-------- Finished Training, elapsed time: 58.52015209197998
---- Creating Validation Set for data in range [303616, 607230]
-------- Start Training on 1574183048.340859
/Users/Michavillson/anaconda3/lib/python3.7/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
  "10 in version 0.20 to 100 in 0.22.", FutureWarning)
-------- Finished Training, elapsed time: 58.19038510322571
---- Creating Validation Set for data in range [607231, 910845]
-------- Start Training on 1574183108.437985
/Users/Michavillson/anaconda3/lib/python3.7/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
  "10 in version 0.20 to 100 in 0.22.", FutureWarning)
-------- Finished Training, elapsed time: 57.9245810508728
---- Creating Validation Set for data in range [910846, 1214460]
-------- Start Training on 1574183168.262582
/Users/Michavillson/anaconda3/lib/python3.7/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
  "10 in version 0.20 to 100 in 0.22.", FutureWarning)
-------- Finished Training, elapsed time: 57.9299750328064

# Calculating the MSE for each fold. 
MSE_dict = {}
for i in range(4): 
    MSE_dict[i] = {
        "predicted": round(np.mean(abs(model_dic[i]["predicted"]-model_dic[i]["test_label"])), 4), 
        "avocado": round(np.mean(abs(model_dic[i]["avocado"]-model_dic[i]["test_label"])), 4),
        "curr_impute": round(np.mean(abs(model_dic[i]["curr_impute"]-model_dic[i]["test_label"])), 4)
    }
pretty(MSE_dict)

0
	predicted
		0.1372
	avocado
		0.1537
	curr_impute
		0.2792
1
	predicted
		0.1402
	avocado
		0.1577
	curr_impute
		0.2832
2
	predicted
		0.1361
	avocado
		0.1529
	curr_impute
		0.277
3
	predicted
		0.1367
	avocado
		0.1546
	curr_impute
		0.2823

# Pairwise Data Visualization. 
for i in range(4): 
    temp_dic = copy.deepcopy(model_dic[i])
    temp_dic.pop("model")
    temp_plot_df = pd.DataFrame(temp_dic)
    temp_plot_df.plot(subplots=True, figsize=(10, 6))

# Plot the feature importance. 
feat_imp_li = model_dic[0]["model"].feature_importances_
feat_importance = pd.Series(feat_imp_li, index=features_orig.columns)
feat_importance.nlargest(6).plot(kind='barh')
# print(model_dic[0]["model"].feature_importances_)

<matplotlib.axes._subplots.AxesSubplot at 0x1abfdb58d0>

Cross-Validated Training on Datasets Without IDEAS State

print(features[0,:])

[0.2037363  0.25777569 4.         0.         0.         1.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         1.        ]

ml_df_shuf.head(5)

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style>

	avo	cur	cell	mark	valid
792090	0.468564	0.481230	C32	M21	0.419407
1151646	0.518758	0.316171	C31	M03	0.479128
384424	0.565685	0.165889	C17	M03	0.519610
579290	0.147659	0.126524	C27	M03	0.499787
956422	0.181357	0.171257	C46	M25	0.176605

# Remove all IDEAS State dummies variable. 
features_wt_idea = ml_df_shuf[["avo", "cur", "cell", "mark"]]
features_wt_idea = pd.get_dummies(features_wt_idea).to_numpy()
features_wt_idea[0,:]

array([0.468564  , 0.48123019, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 1.        ,
       0.        , 0.        , 0.        , 1.        , 0.        ])

model_dic_id_rmd = cv_train("RF", label, features_wt_idea, folds_index)

---- Creating Validation Set for data in range [0, 303615]
-------- Start Training on 1574183999.332468
/Users/Michavillson/anaconda3/lib/python3.7/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
  "10 in version 0.20 to 100 in 0.22.", FutureWarning)
-------- Finished Training, elapsed time: 56.25135779380798
---- Creating Validation Set for data in range [303616, 607230]
-------- Start Training on 1574184057.5759418
/Users/Michavillson/anaconda3/lib/python3.7/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
  "10 in version 0.20 to 100 in 0.22.", FutureWarning)
-------- Finished Training, elapsed time: 55.562814235687256
---- Creating Validation Set for data in range [607231, 910845]
-------- Start Training on 1574184115.145239
/Users/Michavillson/anaconda3/lib/python3.7/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
  "10 in version 0.20 to 100 in 0.22.", FutureWarning)
-------- Finished Training, elapsed time: 63.64612078666687
---- Creating Validation Set for data in range [910846, 1214460]
-------- Start Training on 1574184181.19211
/Users/Michavillson/anaconda3/lib/python3.7/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
  "10 in version 0.20 to 100 in 0.22.", FutureWarning)
-------- Finished Training, elapsed time: 67.51104378700256

# Calculating the MSE for each fold. 
MSE_dict = {}
for i in range(4): 
    MSE_dict[i] = {
        "predicted": round(np.mean(abs(model_dic_id_rmd[i]["predicted"]-model_dic_id_rmd[i]["test_label"])), 4), 
        "avocado": round(np.mean(abs(model_dic_id_rmd[i]["avocado"]-model_dic_id_rmd[i]["test_label"])), 4),
        "curr_impute": round(np.mean(abs(model_dic_id_rmd[i]["curr_impute"]-model_dic_id_rmd[i]["test_label"])), 4)
    }
pretty(MSE_dict)

0
	predicted
		0.1371
	avocado
		0.155
	curr_impute
		0.2808
1
	predicted
		0.1385
	avocado
		0.1552
	curr_impute
		0.2796
2
	predicted
		0.1389
	avocado
		0.1541
	curr_impute
		0.2796
3
	predicted
		0.1381
	avocado
		0.1545
	curr_impute
		0.2817

Creating Models for Each Histone Marks

Data Preparation

ml_df_shuf.head(5)

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style>

	avo	cur	cell	mark	valid
792090	0.468564	0.481230	C32	M21	0.419407
1151646	0.518758	0.316171	C31	M03	0.479128
384424	0.565685	0.165889	C17	M03	0.519610
579290	0.147659	0.126524	C27	M03	0.499787
956422	0.181357	0.171257	C46	M25	0.176605

# Print unique marks. 
marks_list = ml_df_shuf.mark.unique()
print(marks_list)

['M21' 'M03' 'M25']

def hist_models(mark_id, orig_df, ideas=True):
    df = orig_df[orig_df.mark == mark_id]
    label = df["valid"].to_numpy()
    if ideas: 
        features = pd.get_dummies(df[["avo", "cur", "cell", "ideas"]]).to_numpy()
    else: 
        features = pd.get_dummies(df[["avo", "cur", "cell"]]).to_numpy()

    # Define number of folds. 
    k = 4

    # Create index for the folds. 
    folds_index = list(range(0, df.shape[0], math.ceil(df.shape[0]/k))) + [df.shape[0]]
    folds_index = [[folds_index[i]+1, folds_index[i+1]] for i in range(len(folds_index)-1)]
    folds_index[0][0] = 0
    folds_index

    model_dic = cv_train("RF", label, features, folds_index)

    MSE_dict = {}
    for i in range(4): 
        MSE_dict[i] = {
            "predicted": round(np.mean(abs(model_dic[i]["predicted"]-model_dic[i]["test_label"])), 4), 
            "avocado": round(np.mean(abs(model_dic[i]["avocado"]-model_dic[i]["test_label"])), 4),
            "curr_impute": round(np.mean(abs(model_dic[i]["curr_impute"]-model_dic[i]["test_label"])), 4)
        }
    pretty(MSE_dict)

    return {
        "model_dic": model_dic, 
        "MSE": MSE_dict
    }

Model for M21 - H3K4me2

m21_result = hist_models("M21", ml_df_shuf)

---- Creating Validation Set for data in range [0, 116775]
-------- Start Training on 1574188344.497136
/Users/Michavillson/anaconda3/lib/python3.7/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
  "10 in version 0.20 to 100 in 0.22.", FutureWarning)
-------- Finished Training, elapsed time: 17.838374853134155
---- Creating Validation Set for data in range [116776, 233550]
-------- Start Training on 1574188362.94751
/Users/Michavillson/anaconda3/lib/python3.7/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
  "10 in version 0.20 to 100 in 0.22.", FutureWarning)
-------- Finished Training, elapsed time: 17.883373975753784
---- Creating Validation Set for data in range [233551, 350325]
-------- Start Training on 1574188381.4168372
/Users/Michavillson/anaconda3/lib/python3.7/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
  "10 in version 0.20 to 100 in 0.22.", FutureWarning)
-------- Finished Training, elapsed time: 17.50961685180664
---- Creating Validation Set for data in range [350326, 467100]
-------- Start Training on 1574188399.50402
/Users/Michavillson/anaconda3/lib/python3.7/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
  "10 in version 0.20 to 100 in 0.22.", FutureWarning)
-------- Finished Training, elapsed time: 17.69806694984436
0
	predicted
		0.1315
	avocado
		0.1523
	curr_impute
		0.3132
1
	predicted
		0.1315
	avocado
		0.1493
	curr_impute
		0.3041
2
	predicted
		0.1337
	avocado
		0.1541
	curr_impute
		0.3129
3
	predicted
		0.1332
	avocado
		0.1519
	curr_impute
		0.3134

m21_result_wt_ideas = hist_models("M21", ml_df_shuf, False)

---- Creating Validation Set for data in range [0, 116775]
-------- Start Training on 1574188997.884465
/Users/Michavillson/anaconda3/lib/python3.7/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
  "10 in version 0.20 to 100 in 0.22.", FutureWarning)
-------- Finished Training, elapsed time: 16.571380853652954
---- Creating Validation Set for data in range [116776, 233550]
-------- Start Training on 1574189015.0444632
/Users/Michavillson/anaconda3/lib/python3.7/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
  "10 in version 0.20 to 100 in 0.22.", FutureWarning)
-------- Finished Training, elapsed time: 16.519545793533325
---- Creating Validation Set for data in range [233551, 350325]
-------- Start Training on 1574189032.130722
/Users/Michavillson/anaconda3/lib/python3.7/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
  "10 in version 0.20 to 100 in 0.22.", FutureWarning)
-------- Finished Training, elapsed time: 16.46050000190735
---- Creating Validation Set for data in range [350326, 467100]
-------- Start Training on 1574189049.203674
/Users/Michavillson/anaconda3/lib/python3.7/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
  "10 in version 0.20 to 100 in 0.22.", FutureWarning)
-------- Finished Training, elapsed time: 17.15988779067993
0
	predicted
		0.1319
	avocado
		0.1523
	curr_impute
		0.3132
1
	predicted
		0.1318
	avocado
		0.1493
	curr_impute
		0.3041
2
	predicted
		0.1339
	avocado
		0.1541
	curr_impute
		0.3129
3
	predicted
		0.1332
	avocado
		0.1519
	curr_impute
		0.3134

Model for M03 - H2AFZ

m03_result = hist_models("M03", ml_df_shuf)

---- Creating Validation Set for data in range [0, 93420]
-------- Start Training on 1574188493.600275
/Users/Michavillson/anaconda3/lib/python3.7/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
  "10 in version 0.20 to 100 in 0.22.", FutureWarning)
-------- Finished Training, elapsed time: 13.731431722640991
---- Creating Validation Set for data in range [93421, 186840]
-------- Start Training on 1574188507.847035
/Users/Michavillson/anaconda3/lib/python3.7/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
  "10 in version 0.20 to 100 in 0.22.", FutureWarning)
-------- Finished Training, elapsed time: 12.977031230926514
---- Creating Validation Set for data in range [186841, 280260]
-------- Start Training on 1574188521.262924
/Users/Michavillson/anaconda3/lib/python3.7/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
  "10 in version 0.20 to 100 in 0.22.", FutureWarning)
-------- Finished Training, elapsed time: 12.248181104660034
---- Creating Validation Set for data in range [280261, 373680]
-------- Start Training on 1574188533.940921
/Users/Michavillson/anaconda3/lib/python3.7/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
  "10 in version 0.20 to 100 in 0.22.", FutureWarning)
-------- Finished Training, elapsed time: 12.40692687034607
0
	predicted
		0.1064
	avocado
		0.1181
	curr_impute
		0.2546
1
	predicted
		0.1058
	avocado
		0.1173
	curr_impute
		0.255
2
	predicted
		0.1081
	avocado
		0.1195
	curr_impute
		0.2601
3
	predicted
		0.1075
	avocado
		0.1199
	curr_impute
		0.2613

m03_result_wt_ideas = hist_models("M03", ml_df_shuf, False)

---- Creating Validation Set for data in range [0, 93420]
-------- Start Training on 1574188889.4063501
/Users/Michavillson/anaconda3/lib/python3.7/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
  "10 in version 0.20 to 100 in 0.22.", FutureWarning)
-------- Finished Training, elapsed time: 12.069170951843262
---- Creating Validation Set for data in range [93421, 186840]
-------- Start Training on 1574188901.90952
/Users/Michavillson/anaconda3/lib/python3.7/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
  "10 in version 0.20 to 100 in 0.22.", FutureWarning)
-------- Finished Training, elapsed time: 11.77048110961914
---- Creating Validation Set for data in range [186841, 280260]
-------- Start Training on 1574188914.099287
/Users/Michavillson/anaconda3/lib/python3.7/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
  "10 in version 0.20 to 100 in 0.22.", FutureWarning)
-------- Finished Training, elapsed time: 11.769668817520142
---- Creating Validation Set for data in range [280261, 373680]
-------- Start Training on 1574188926.2890701
/Users/Michavillson/anaconda3/lib/python3.7/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
  "10 in version 0.20 to 100 in 0.22.", FutureWarning)
-------- Finished Training, elapsed time: 11.894800901412964
0
	predicted
		0.1067
	avocado
		0.1181
	curr_impute
		0.2546
1
	predicted
		0.1053
	avocado
		0.1173
	curr_impute
		0.255
2
	predicted
		0.1079
	avocado
		0.1195
	curr_impute
		0.2601
3
	predicted
		0.1077
	avocado
		0.1199
	curr_impute
		0.2613

Model for M25 - H3K79me2

m25_result = hist_models("M25", ml_df_shuf)

---- Creating Validation Set for data in range [0, 93420]
-------- Start Training on 1574188633.599432
/Users/Michavillson/anaconda3/lib/python3.7/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
  "10 in version 0.20 to 100 in 0.22.", FutureWarning)
-------- Finished Training, elapsed time: 13.147719860076904
---- Creating Validation Set for data in range [93421, 186840]
-------- Start Training on 1574188647.193089
/Users/Michavillson/anaconda3/lib/python3.7/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
  "10 in version 0.20 to 100 in 0.22.", FutureWarning)
-------- Finished Training, elapsed time: 12.824733972549438
---- Creating Validation Set for data in range [186841, 280260]
-------- Start Training on 1574188660.440157
/Users/Michavillson/anaconda3/lib/python3.7/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
  "10 in version 0.20 to 100 in 0.22.", FutureWarning)
-------- Finished Training, elapsed time: 12.775397062301636
---- Creating Validation Set for data in range [280261, 373680]
-------- Start Training on 1574188673.624012
/Users/Michavillson/anaconda3/lib/python3.7/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
  "10 in version 0.20 to 100 in 0.22.", FutureWarning)
-------- Finished Training, elapsed time: 12.997114896774292
0
	predicted
		0.1729
	avocado
		0.1953
	curr_impute
		0.2664
1
	predicted
		0.1769
	avocado
		0.2005
	curr_impute
		0.2736
2
	predicted
		0.1698
	avocado
		0.1891
	curr_impute
		0.2577
3
	predicted
		0.1713
	avocado
		0.192
	curr_impute
		0.2624

m25_result_wt_ideas = hist_models("M25", ml_df_shuf, False)

---- Creating Validation Set for data in range [0, 93420]
-------- Start Training on 1574189102.0591948
/Users/Michavillson/anaconda3/lib/python3.7/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
  "10 in version 0.20 to 100 in 0.22.", FutureWarning)
-------- Finished Training, elapsed time: 11.923866271972656
---- Creating Validation Set for data in range [93421, 186840]
-------- Start Training on 1574189114.4053
/Users/Michavillson/anaconda3/lib/python3.7/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
  "10 in version 0.20 to 100 in 0.22.", FutureWarning)
-------- Finished Training, elapsed time: 11.734225988388062
---- Creating Validation Set for data in range [186841, 280260]
-------- Start Training on 1574189126.537584
/Users/Michavillson/anaconda3/lib/python3.7/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
  "10 in version 0.20 to 100 in 0.22.", FutureWarning)
-------- Finished Training, elapsed time: 11.834797859191895
---- Creating Validation Set for data in range [280261, 373680]
-------- Start Training on 1574189138.7640052
/Users/Michavillson/anaconda3/lib/python3.7/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
  "10 in version 0.20 to 100 in 0.22.", FutureWarning)
-------- Finished Training, elapsed time: 13.448953866958618
0
	predicted
		0.1733
	avocado
		0.1953
	curr_impute
		0.2664
1
	predicted
		0.1786
	avocado
		0.2005
	curr_impute
		0.2736
2
	predicted
		0.1724
	avocado
		0.1891
	curr_impute
		0.2577
3
	predicted
		0.1714
	avocado
		0.192
	curr_impute
		0.2624

Convert the Above Statistics into Tidy Data Format

print(marks_list)

['M21' 'M03' 'M25']

hist_list = [["Mark_id", "Iteration", "predicted", "avocado", "curr_impute", "ideas"]]
M25_list =  [["M25", i[0], i[1]["predicted"], i[1]["avocado"], i[1]["curr_impute"], False] for i in m25_result_wt_ideas["MSE"].items()]
M25_list += [["M25", i[0], i[1]["predicted"], i[1]["avocado"], i[1]["curr_impute"], True] for i in m25_result["MSE"].items()]
M03_list =  [["M03", i[0], i[1]["predicted"], i[1]["avocado"], i[1]["curr_impute"], False] for i in m03_result_wt_ideas["MSE"].items()]
M03_list += [["M03", i[0], i[1]["predicted"], i[1]["avocado"], i[1]["curr_impute"], True] for i in m03_result["MSE"].items()]
M21_list =  [["M21", i[0], i[1]["predicted"], i[1]["avocado"], i[1]["curr_impute"], False] for i in m21_result_wt_ideas["MSE"].items()]
M21_list += [["M21", i[0], i[1]["predicted"], i[1]["avocado"], i[1]["curr_impute"], True] for i in m21_result["MSE"].items()]
hist_df = pd.DataFrame(M25_list + M03_list + M21_list, columns = hist_list[0])

hist_df.to_csv("sources/ML_model/output/mse_hist_marks_model.csv", index=False)
hist_df

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style>

	Mark_id	Iteration	predicted	avocado	curr_impute	ideas
0	M25	0	0.1733	0.1953	0.2664	False
1	M25	1	0.1786	0.2005	0.2736	False
2	M25	2	0.1724	0.1891	0.2577	False
3	M25	3	0.1714	0.1920	0.2624	False
4	M25	0	0.1729	0.1953	0.2664	True
5	M25	1	0.1769	0.2005	0.2736	True
6	M25	2	0.1698	0.1891	0.2577	True
7	M25	3	0.1713	0.1920	0.2624	True
8	M03	0	0.1067	0.1181	0.2546	False
9	M03	1	0.1053	0.1173	0.2550	False
10	M03	2	0.1079	0.1195	0.2601	False
11	M03	3	0.1077	0.1199	0.2613	False
12	M03	0	0.1064	0.1181	0.2546	True
13	M03	1	0.1058	0.1173	0.2550	True
14	M03	2	0.1081	0.1195	0.2601	True
15	M03	3	0.1075	0.1199	0.2613	True
16	M21	0	0.1319	0.1523	0.3132	False
17	M21	1	0.1318	0.1493	0.3041	False
18	M21	2	0.1339	0.1541	0.3129	False
19	M21	3	0.1332	0.1519	0.3134	False
20	M21	0	0.1315	0.1523	0.3132	True
21	M21	1	0.1315	0.1493	0.3041	True
22	M21	2	0.1337	0.1541	0.3129	True
23	M21	3	0.1332	0.1519	0.3134	True

SecantZhang / DS340W-Group10-FA19

Cross-Validation Training

Todo:

Data Preparation

Cross-Validation Training

Cross-Validated Training on Datasets Without IDEAS State

Creating Models for Each Histone Marks

Data Preparation

Model for M21 - H3K4me2

Model for M03 - H2AFZ

Model for M25 - H3K79me2

Convert the Above Statistics into Tidy Data Format

About

Languages