Solution for the Kaggle competition PMRCT
Personalized Medicine: Redefining Cancer Treatment
Predict the effect of genetic variants to enable personalized medicine
import numpy as np # Linear Algebra
import pandas as pd # Data processing
from pandas import HDFStore
import matplotlib .pyplot as plt # Data Visualization
% matplotlib inline
import seaborn as sns # Visualization
from sklearn .feature_extraction .text import TfidfVectorizer , ENGLISH_STOP_WORDS , CountVectorizer
from sklearn .decomposition import TruncatedSVD , PCA
from sklearn .model_selection import cross_val_predict , StratifiedKFold , train_test_split , GridSearchCV
from sklearn .metrics import log_loss , accuracy_score
from collections import Counter
from sklearn .ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import re
import nltk
Data Importing & Preprocessing
# Load the competition files.
# The text files are split on a literal "||"; the separator is a regular
# expression, hence engine="python". It is written as a raw string so the
# backslashes reach the regex engine verbatim instead of being parsed as
# (invalid) string escape sequences.
# The first line of each text file is skipped and the column names are
# supplied explicitly.
train_text_df = pd.read_csv('data/training_text', sep=r"\|\|", engine="python", skiprows=1, names=["ID", "Text"])
test_text_df = pd.read_csv('data/test_text', sep=r"\|\|", engine="python", skiprows=1, names=["ID", "Text"])
# The variants files are ordinary comma-separated files with their own header.
train_vari_df = pd.read_csv('data/training_variants')
test_vari_df = pd.read_csv('data/test_variants')
<style>
.dataframe thead tr:only-child th {
text-align: right;
}
.dataframe thead th {
text-align: left;
}
.dataframe tbody tr th {
vertical-align: top;
}
</style>
ID
Text
0
0
Cyclin-dependent kinases (CDKs) regulate a var...
1
1
Abstract Background Non-small cell lung canc...
2
2
Abstract Background Non-small cell lung canc...
3
3
Recent evidence has demonstrated that acquired...
4
4
Oncogenic mutations in the monomeric Casitas B...
<style>
.dataframe thead tr:only-child th {
text-align: right;
}
.dataframe thead th {
text-align: left;
}
.dataframe tbody tr th {
vertical-align: top;
}
</style>
ID
Gene
Variation
Class
3316
3316
RUNX1
D171N
4
3317
3317
RUNX1
A122*
1
3318
3318
RUNX1
Fusions
1
3319
3319
RUNX1
R80C
4
3320
3320
RUNX1
K83E
4
# Attach the text to every training variant through the shared ID key,
# then separate the label vector from the three raw feature columns.
df = train_vari_df.join(other=train_text_df.set_index(keys='ID'), on='ID')
df_X = df.loc[:, ['Gene', 'Variation', 'Text']]
df_y = df.loc[:, 'Class'].values
print(type(df_y), type(df_X), sep='\n')
<class 'numpy.ndarray'>
<class 'pandas.core.frame.DataFrame'>
# Same ID join for the test set; the first column is then dropped by position
# so the remaining columns line up with df_X.
df_test = test_vari_df.join(other=test_text_df.set_index(keys='ID'), on='ID').iloc[:, 1:]
# Stack train rows first and test rows second under a fresh 0..n-1 index, so
# feature extraction is fitted once on both.
df_new = pd.concat(objs=[df_X, df_test], ignore_index=True)
print(type(df_new), df_new.shape)
<class 'pandas.core.frame.DataFrame'> (8989, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8989 entries, 0 to 8988
Data columns (total 3 columns):
Gene 8989 non-null object
Variation 8989 non-null object
Text 8989 non-null object
dtypes: object(3)
memory usage: 210.8+ KB
Extracting Features
# TF-IDF vectorizer with its default configuration; it is fitted further down
# on the cleaned 'Text' column.
tfidf_vect = TfidfVectorizer ()
# Stop-word collection shipped with scikit-learn; text_Decomposition reads this
# module-level name when filtering tokens.
stop_words = ENGLISH_STOP_WORDS
def text_Decomposition(text, stopwords=None):
    """Normalise one document into a lower-case, space-separated token string.

    Steps, in order:
      1. every character other than ASCII letters, digits and the symbols
         ``^ , ! . / + - _ =`` becomes a space;
      2. periods and commas become spaces;
      3. the text is lower-cased and split on whitespace;
      4. tokens found in the stop-word collection are dropped;
      5. the remaining tokens are joined with single spaces.

    Args:
        text: The raw document as a ``str``.
        stopwords: Optional collection of tokens to drop. When omitted, the
            module-level ``stop_words`` is used.

    Returns:
        The cleaned document as a single string.
    """
    if stopwords is None:
        stopwords = stop_words
    # The hyphen sits last in the class so it is a literal "-". Written as
    # "+-_" it formed a range from "+" to "_" and silently whitelisted
    # characters such as ":", ";", "<", ">", "?", "@", "[", "]" and "\".
    text = re.sub(r"[^a-zA-Z0-9^,!./+_=-]", " ", text)
    # Strip sentence punctuation before the stop-word filter; otherwise tokens
    # such as "the." or "and," do not match the stop-word list and survive.
    text = text.replace(".", " ").replace(",", " ")
    tokens = [tok for tok in text.lower().split() if tok not in stopwords]
    return " ".join(tokens)
# Clean every document in the combined frame (train rows followed by test rows)
# in place before any vectorizer sees it.
df_new ['Text' ] = df_new ['Text' ].apply (text_Decomposition )
0 cyclin-dependent kinases cdks regulate variety...
1 abstract background non-small cell lung cancer...
2 abstract background non-small cell lung cancer...
3 recent evidence demonstrated acquired uniparen...
4 oncogenic mutations monomeric casitas b-lineag...
5 oncogenic mutations monomeric casitas b-lineag...
6 oncogenic mutations monomeric casitas b-lineag...
7 cbl negative regulator activated receptor tyro...
8 abstract juvenile myelomonocytic leukemia jmml...
9 abstract juvenile myelomonocytic leukemia jmml...
10 oncogenic mutations monomeric casitas b-lineag...
11 noonan syndrome autosomal dominant congenital ...
12 noonan syndrome autosomal dominant congenital ...
13 noonan syndrome autosomal dominant congenital ...
14 oncogenic mutations monomeric casitas b-lineag...
15 noonan syndrome autosomal dominant congenital ...
16 determine residual cylindrical refractive erro...
17 acquired uniparental disomy aupd common featur...
18 oncogenic mutations monomeric casitas b-lineag...
19 acquired uniparental disomy aupd common featur...
20 abstract background non-small cell lung cancer...
21 oncogenic mutations monomeric casitas b-lineag...
22 oncogenic mutations monomeric casitas b-lineag...
23 recent evidence demonstrated acquired uniparen...
24 recent evidence demonstrated acquired uniparen...
25 recent evidence demonstrated acquired uniparen...
26 abstract n-myristoylation common form co-trans...
27 heterozygous mutations telomerase components t...
28 sequencing studies identified recurrent coding...
29 heterozygous mutations telomerase components t...
...
8959 using dna microarray approach screen gene copy...
8960 tumor suppressor protein p53 inactivated mutat...
8961 mutational analysis oncogenes critical underst...
8962 serine/threonine protein kinase encoded akf pr...
8963 common participation oncogenic kras proteins l...
8964 ezh2 enhancer zeste homolog 2 critical enzymat...
8965 mutations metabolic enzymes isocitrate dehydro...
8966 pancreatic carcinomas acinar differentiation ...
8967 -catenin-mediated signaling constitutively act...
8968 summary genetic abnormalities underlying hered...
8969 lung cancer leading cause cancer-related morta...
8970 summary past decade treatment lung adenocarci...
8971 transcription factor tumor suppressor protein ...
8972 protein tyrosine phosphatase receptor type d p...
8973 sensitizing activating mutations tyrosine kina...
8974 structural rearrangements chromosome 10 freque...
8975 introduction production fertile gametes essent...
8976 checkpoint kinase 2 chek2 chk2 emerges import...
8977 introduction telomere sequences chromosomal en...
8978 estimated 1 million cases breast cancer bc dia...
8979 occurring responders expression table i359l 0 ...
8980 background aims: inherited deleterious mutatio...
8981 glioblastoma multiforme gbm lethal brain tumou...
8982 diffuse large b cell lymphoma dlbcl complex di...
8983 figure largedownload s818l clones change atpas...
8984 realization late 1970s ras harboured transform...
8985 hemizygous deletions common molecular abnormal...
8986 r267w smartpool investigate 533 experiments 5q...
8987 abstract blood samples 125 unrelated families ...
8988 loss dna mismatch repair mmr humans mainly mu...
Name: Text, Length: 8989, dtype: object
# Learn the vocabulary and IDF weights on the combined train+test text and
# return the sparse document-term matrix; the print reports its type and shape.
tfidf_features = tfidf_vect .fit_transform (df_new ['Text' ])
print ( type (tfidf_features ), tfidf_features .get_shape () )
<class 'scipy.sparse.csr.csr_matrix'> (8989, 167304)
# Compress the sparse TF-IDF matrix to 500 dense components with truncated SVD;
# random_state is fixed so repeated runs give the same projection.
svd = TruncatedSVD (n_components = 500 , n_iter = 5 , random_state = 0 )
truncated_tfidf = svd .fit_transform (tfidf_features )
print ( type ( truncated_tfidf ), truncated_tfidf .shape )
<class 'numpy.ndarray'> (8989, 500)
# Wrap the reduced TF-IDF matrix in a DataFrame with columns tfidf_0, tfidf_1, ...
# The column count is taken from the array itself, so it keeps matching if the
# SVD's n_components is changed.
df_tfidf_col_name = ["tfidf_" + str(i) for i in range(truncated_tfidf.shape[1])]
df_tfidf = pd.DataFrame(truncated_tfidf, columns=df_tfidf_col_name)
df_tfidf.tail()
<style>
.dataframe thead tr:only-child th {
text-align: right;
}
.dataframe thead th {
text-align: left;
}
.dataframe tbody tr th {
vertical-align: top;
}
</style>
tfidf_0
tfidf_1
tfidf_2
tfidf_3
tfidf_4
tfidf_5
tfidf_6
tfidf_7
tfidf_8
tfidf_9
...
tfidf_490
tfidf_491
tfidf_492
tfidf_493
tfidf_494
tfidf_495
tfidf_496
tfidf_497
tfidf_498
tfidf_499
8984
0.198822
-0.050257
0.012171
-0.089914
-0.004059
-0.027756
0.009028
-0.050974
0.080609
0.033432
...
-0.017983
-0.005827
0.009936
-0.000408
0.009475
0.011130
0.003676
0.011604
-0.019133
-0.003256
8985
0.168936
-0.042271
-0.007801
-0.062719
0.020268
-0.023329
-0.006039
-0.027311
0.033846
-0.007164
...
-0.013675
0.004163
-0.009727
-0.005671
-0.007209
0.031979
0.007391
0.003575
0.033547
0.010747
8986
0.242678
-0.099755
-0.108291
0.137711
0.065636
0.127120
-0.129976
0.030754
-0.017880
-0.014541
...
-0.017030
0.006821
0.006546
-0.003729
0.009402
0.022159
-0.002369
-0.004654
-0.022835
-0.002433
8987
0.163984
-0.025351
0.004394
-0.007817
0.021132
-0.008803
-0.003844
-0.003872
-0.000813
-0.000259
...
-0.008726
-0.007982
-0.004096
-0.004657
-0.005381
0.010965
0.006184
0.001998
-0.004294
-0.003493
8988
0.169593
0.042018
0.020699
-0.015233
0.007866
-0.033753
-0.021334
0.048231
-0.030702
-0.012852
...
-0.000946
0.009195
0.008646
-0.001500
0.003134
-0.014157
0.007950
-0.001908
0.012477
-0.014073
5 rows × 500 columns
# Bag-of-words counts over unigrams (ngram_range=(1, 1)), keeping every term
# that occurs in at least one document (min_df=1). Fitted on the same cleaned,
# combined train+test text as the TF-IDF features.
bow_vectorizer = CountVectorizer (min_df = 1 , ngram_range = (1 ,1 ))
bow_features = bow_vectorizer .fit_transform (df_new ['Text' ])
print (type (bow_features ), bow_features .get_shape ())
<class 'scipy.sparse.csr.csr_matrix'> (8989, 167304)
# Separate truncated SVD for the count matrix, with the same settings as the
# TF-IDF one: 500 components, 5 iterations, fixed seed.
svd_bow = TruncatedSVD (n_components = 500 , n_iter = 5 , random_state = 0 )
truncated_bow = svd_bow .fit_transform ( bow_features )
print ( type ( truncated_bow ), truncated_bow .shape )
<class 'numpy.ndarray'> (8989, 500)
# Wrap the reduced bag-of-words matrix in a DataFrame with columns bow_0, bow_1, ...
# The column count is taken from the array itself, so it keeps matching if the
# SVD's n_components is changed.
df_bow_col_name = ["bow_" + str(i) for i in range(truncated_bow.shape[1])]
df_bow = pd.DataFrame(truncated_bow, columns=df_bow_col_name)
df_bow.tail()
<style>
.dataframe thead tr:only-child th {
text-align: right;
}
.dataframe thead th {
text-align: left;
}
.dataframe tbody tr th {
vertical-align: top;
}
</style>
bow_0
bow_1
bow_2
bow_3
bow_4
bow_5
bow_6
bow_7
bow_8
bow_9
...
bow_490
bow_491
bow_492
bow_493
bow_494
bow_495
bow_496
bow_497
bow_498
bow_499
8984
194.526113
-47.991367
-23.033461
-44.469582
12.314702
112.890376
-6.968366
-39.409591
35.309063
48.143541
...
6.041960
-1.352358
7.512079
0.280627
-1.234230
-2.138932
2.974048
2.003135
-2.488450
-3.096574
8985
69.715744
-15.101818
-9.751791
6.825362
4.400990
8.165505
-8.844331
-5.926672
14.725673
15.641581
...
1.858497
2.571454
0.303478
-3.180141
0.454055
-1.137666
0.840475
0.636456
1.383821
-0.293513
8986
57.750708
48.205872
-13.017993
-8.674400
20.067003
-23.089222
-9.090631
-15.952598
-1.722238
18.967338
...
2.248561
0.538540
1.048798
0.916919
-1.144835
0.955626
-0.159801
1.170221
2.212243
-1.639418
8987
210.185671
7.402431
9.203735
-33.907666
-30.323195
-12.265160
-18.975167
-24.648927
48.958101
-0.342035
...
-5.108713
5.970898
2.408793
2.173039
1.585976
11.776157
-3.472449
4.134649
2.737938
-5.758787
8988
69.524366
-2.637226
34.608404
-0.423025
-0.272547
-2.477183
-1.825543
2.157657
16.846462
5.286870
...
0.779397
-0.458060
-0.016754
-2.339020
-1.451003
2.258365
4.500253
0.275865
0.884756
0.543471
5 rows × 500 columns
# Keep only the first two columns of the combined frame (Gene and Variation),
# which are the categorical inputs for one-hot encoding.
df_dummy = df_new .iloc [:,:2 ]
<style>
.dataframe thead tr:only-child th {
text-align: right;
}
.dataframe thead th {
text-align: left;
}
.dataframe tbody tr th {
vertical-align: top;
}
</style>
Gene
Variation
0
FAM58A
Truncating Mutations
1
CBL
W802*
2
CBL
Q249E
3
CBL
N454D
4
CBL
L399V
5
CBL
V391I
6
CBL
V430M
7
CBL
Deletion
8
CBL
Y371H
9
CBL
C384R
10
CBL
P395A
11
CBL
K382E
12
CBL
R420Q
13
CBL
C381A
14
CBL
P428L
15
CBL
D390Y
16
CBL
Truncating Mutations
17
CBL
Q367P
18
CBL
M374V
19
CBL
Y371S
20
CBL
H94Y
21
CBL
C396R
22
CBL
G375P
23
CBL
S376F
24
CBL
P417A
25
CBL
H398Y
26
SHOC2
S2G
27
TERT
Y846C
28
TERT
C228T
29
TERT
H412Y
...
...
...
8959
VSX1
R166W
8960
MTM1
E157K
8961
D2HGDH
V444A
8962
DMD
Y231N
8963
ANKH
G389R
8964
DCX
R59L
8965
ADSL
R190Q
8966
HSD17B3
A56T
8967
CD96
T280M
8968
HPS3
R397W
8969
FKTN
R179T
8970
DARS2
L613F
8971
TP53
G245C
8972
ALG12
T67M
8973
ACOX1
Q309R
8974
CLDN19
Q57E
8975
MLH3
N499S
8976
GJB1
F235C
8977
LRP5
G171V
8978
TGFBI
R124S
8979
CYP2C9
I359L
8980
HSD17B3
M235V
8981
CAV3
A46V
8982
ABHD5
E260K
8983
NR3C2
S818L
8984
SLC46A1
R113S
8985
FOXC1
L130F
8986
GSS
R267W
8987
CTSK
G79E
8988
DFNB59
T54I
8989 rows × 2 columns
# One-hot encode both categorical columns; each distinct value becomes an
# indicator column named "<column>_<value>".
df_dummy = pd .get_dummies (df_dummy )
<style>
.dataframe thead tr:only-child th {
text-align: right;
}
.dataframe thead th {
text-align: left;
}
.dataframe tbody tr th {
vertical-align: top;
}
</style>
Gene_A4GALT
Gene_AAAS
Gene_AANAT
Gene_AARS
Gene_ABCA1
Gene_ABCA12
Gene_ABCA3
Gene_ABCA4
Gene_ABCB11
Gene_ABCB7
...
Variation_null380R
Variation_null399R
Variation_null420W
Variation_null423L
Variation_null462G
Variation_null483L
Variation_null496R
Variation_null522S
Variation_null654G
Variation_p61BRAF
8984
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
8985
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
8986
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
8987
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
8988
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
5 rows × 10116 columns
Combine TF-IDF Features + BoW Features + Dummy Features
# Put the three feature groups side by side: bag-of-words components first,
# then TF-IDF components, then the one-hot Gene/Variation indicators.
# All three frames carry the same default 0..n-1 row index, so a column-wise
# concat lines the rows up directly. This replaces the temporary 'tmp' key and
# the two joins, which also left a stray 'tmp' column in the source frames.
df_new = pd.concat([df_bow, df_tfidf, df_dummy], axis=1)
print(type(df_new), df_new.shape)
<class 'pandas.core.frame.DataFrame'> (8989, 11116)
<style>
.dataframe thead tr:only-child th {
text-align: right;
}
.dataframe thead th {
text-align: left;
}
.dataframe tbody tr th {
vertical-align: top;
}
</style>
Gene_A4GALT
Gene_AAAS
Gene_AANAT
Gene_AARS
Gene_ABCA1
Gene_ABCA12
Gene_ABCA3
Gene_ABCA4
Gene_ABCB11
Gene_ABCB7
...
bow_490
bow_491
bow_492
bow_493
bow_494
bow_495
bow_496
bow_497
bow_498
bow_499
8984
0
0
0
0
0
0
0
0
0
0
...
6.041960
-1.352358
7.512079
0.280627
-1.234230
-2.138932
2.974048
2.003135
-2.488450
-3.096574
8985
0
0
0
0
0
0
0
0
0
0
...
1.858497
2.571454
0.303478
-3.180141
0.454055
-1.137666
0.840475
0.636456
1.383821
-0.293513
8986
0
0
0
0
0
0
0
0
0
0
...
2.248561
0.538540
1.048798
0.916919
-1.144835
0.955626
-0.159801
1.170221
2.212243
-1.639418
8987
0
0
0
0
0
0
0
0
0
0
...
-5.108713
5.970898
2.408793
2.173039
1.585976
11.776157
-3.472449
4.134649
2.737938
-5.758787
8988
0
0
0
0
0
0
0
0
0
0
...
0.779397
-0.458060
-0.016754
-2.339020
-1.451003
2.258365
4.500253
0.275865
0.884756
0.543471
5 rows × 11116 columns
# Undo the earlier train+test stacking: training rows were stacked first, so
# the first len(df_y) rows belong to training and the rest to the test set.
# The boundary comes from the label vector instead of a hard-coded row count,
# so it stays correct if the training file changes size.
df_X = df_new.iloc[:len(df_y), :]
df_test = df_new.iloc[len(df_y):, :]
<style>
.dataframe thead tr:only-child th {
text-align: right;
}
.dataframe thead th {
text-align: left;
}
.dataframe tbody tr th {
vertical-align: top;
}
</style>
Gene_A4GALT
Gene_AAAS
Gene_AANAT
Gene_AARS
Gene_ABCA1
Gene_ABCA12
Gene_ABCA3
Gene_ABCA4
Gene_ABCB11
Gene_ABCB7
...
bow_490
bow_491
bow_492
bow_493
bow_494
bow_495
bow_496
bow_497
bow_498
bow_499
3316
0
0
0
0
0
0
0
0
0
0
...
1.745059
-5.821573
3.241390
-4.601446
-0.799721
0.964183
-6.654517
-7.173837
-0.191026
-3.266606
3317
0
0
0
0
0
0
0
0
0
0
...
2.279242
-2.976092
1.965532
-8.854997
3.721709
2.073133
-9.029381
-10.325801
-7.744560
-3.181727
3318
0
0
0
0
0
0
0
0
0
0
...
-2.660800
-1.165723
6.478892
0.090645
5.618230
-3.316006
-4.345822
-6.997560
1.835467
-5.563675
3319
0
0
0
0
0
0
0
0
0
0
...
-3.003526
2.542012
-2.760956
0.071038
-1.042046
-1.497018
2.630176
0.224616
6.731985
-0.477152
3320
0
0
0
0
0
0
0
0
0
0
...
-2.288296
3.421923
-2.794622
-1.133969
-4.935845
-4.073437
4.769763
1.425812
9.841368
-6.163921
5 rows × 11116 columns
# Hold out part of the labelled data for local validation. No test_size is
# given, so scikit-learn's default split fraction applies; random_state fixes
# the shuffle.
train_X , test_X , train_y , test_y = train_test_split (df_X , df_y , random_state = 0 )
# Cross Validation
def model_cv(train, test, train_y, test_y, model, name):
    """Fit a parameter search on the training split and print hold-out metrics.

    Prints, in order: the best parameters found, the score on the training
    split, the score on the hold-out split, the hold-out log loss, and a blank
    line.

    Args:
        train: Training feature matrix.
        test: Hold-out feature matrix.
        train_y: Labels for ``train``.
        test_y: Labels for ``test``.
        model: Unfitted search object. After ``fit`` it must expose
            ``best_params_``, ``predict_proba``, ``score`` and ``classes_``
            (for example a ``GridSearchCV`` around a classifier).
        name: Label printed in front of the best parameters.

    Returns:
        None. Results are only printed.
    """
    model.fit(train, train_y)
    print(name, ': ', model.best_params_)
    pred_y = model.predict_proba(test)
    print('train score: {}'.format(model.score(train, train_y)))
    print('test score: {}'.format(model.score(test, test_y)))
    # The probability columns follow the classes seen during fit. Passing them
    # explicitly keeps log_loss aligned even when the hold-out split happens
    # to miss one of the rarer classes.
    print('log loss: {}'.format(log_loss(test_y, pred_y, labels=model.classes_)))
    print()
# Models
def forest(train, test, train_y, test_y):
    """Grid-search a random forest and print its metrics through model_cv.

    The grid holds one candidate: 500 trees with ``max_features='sqrt'``.

    Args:
        train: Training feature matrix.
        test: Hold-out feature matrix.
        train_y: Labels for ``train``.
        test_y: Labels for ``test``.

    Returns:
        Whatever ``model_cv`` returns.
    """
    param = [{'n_estimators': [500],
              'max_features': ['sqrt']}]
    # Three unshuffled stratified folds, the same count lgbm() requests with
    # cv=3. n_splits is explicit so the fold count does not depend on the
    # installed scikit-learn's default. random_state is omitted because it has
    # no effect without shuffle=True and recent scikit-learn releases reject
    # that combination.
    folds = StratifiedKFold(n_splits=3)
    model = GridSearchCV(RandomForestClassifier(n_jobs=-1, random_state=0), param, cv=folds)
    name = 'Random forest'
    return model_cv(train, test, train_y, test_y, model, name)
def xgbc(train, test, train_y, test_y):
    """Grid-search an XGBoost classifier and print its metrics through model_cv.

    The grid holds one candidate: 300 boosting rounds at learning rate 0.05.

    Args:
        train: Training feature matrix.
        test: Hold-out feature matrix.
        train_y: Labels for ``train``.
        test_y: Labels for ``test``.

    Returns:
        Whatever ``model_cv`` returns.
    """
    param = [{'n_estimators': [300],
              'learning_rate': [0.05]}]
    # Three unshuffled stratified folds, the same count lgbm() requests with
    # cv=3. n_splits is explicit so the fold count does not depend on the
    # installed scikit-learn's default. random_state is omitted because it has
    # no effect without shuffle=True and recent scikit-learn releases reject
    # that combination.
    folds = StratifiedKFold(n_splits=3)
    model = GridSearchCV(XGBClassifier(), param, cv=folds)
    name = 'XGBoost'
    return model_cv(train, test, train_y, test_y, model, name)
def lgbm(train, test, train_y, test_y):
    """Grid-search a LightGBM classifier and print its metrics through model_cv.

    The grid holds one candidate (100 estimators, learning rate 0.05) and is
    evaluated with ``cv=3``.

    Args:
        train: Training feature matrix.
        test: Hold-out feature matrix.
        train_y: Labels for ``train``.
        test_y: Labels for ``test``.

    Returns:
        Whatever ``model_cv`` returns.
    """
    search = GridSearchCV(
        LGBMClassifier(),
        [dict(n_estimators=[100], learning_rate=[0.05])],
        cv=3,
    )
    return model_cv(train, test, train_y, test_y, search, 'LightGBM')
# Evaluate the random-forest candidate on the local hold-out split.
forest (train_X , test_X , train_y , test_y )
Random forest : {'max_features': 'sqrt', 'n_estimators': 500}
train score: 1.0
test score: 0.6293622141997594
log loss: 1.4546850488576162
# Evaluate the LightGBM candidate on the same hold-out split.
lgbm (train_X , test_X , train_y , test_y )
LightGBM : {'learning_rate': 0.05, 'n_estimators': 100}
train score: 0.9080321285140562
test score: 0.6702767749699158
log loss: 0.9912064764789812
# The XGBoost candidate is left disabled:
# xgbc(train_X, test_X, train_y, test_y)
# LightGBM is the model selected for the final predictions.
# NOTE(review): this rebinds the name `lgbm` from the helper function defined
# above to a classifier instance, so the helper cannot be called again after
# this line runs.
# NOTE(review): the final model is fitted on train_X/train_y only, not on the
# full labelled set (df_X/df_y) -- confirm this is intended.
lgbm = LGBMClassifier (learning_rate = 0.05 , n_estimators = 100 )
lgbm .fit (train_X , train_y )
LGBMClassifier(boosting_type='gbdt', colsample_bytree=1, learning_rate=0.05,
max_bin=255, max_depth=-1, min_child_samples=10,
min_child_weight=5, min_split_gain=0, n_estimators=100, nthread=-1,
num_leaves=31, objective='multiclass', reg_alpha=0, reg_lambda=0,
seed=0, silent=True, subsample=1, subsample_for_bin=50000,
subsample_freq=1)
# Score the competition test rows twice: `pred` holds one predicted class label
# per row, `pred_pro` holds one probability per class per row.
pred = lgbm .predict (df_test )
pred_pro = lgbm .predict_proba (df_test )
print ( type ( pred ), pred .shape , pred_pro .shape )
<class 'numpy.ndarray'> (5668,) (5668, 9)
array([7, 4, 7, ..., 2, 7, 4])
from sklearn import preprocessing

# One-hot encoder for the predicted labels. It is fitted on the classifier's
# own label set rather than on the predictions, so the encoded matrix always
# has one column per known class, even when some class is never predicted
# for the test rows.
lb = preprocessing.LabelBinarizer()
lb.fit(lgbm.classes_)
LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)
array([1, 2, 3, 4, 5, 6, 7, 8, 9])
# Replace the 1-D label vector with its one-hot matrix (one column per class
# known to the binarizer) and report the new shape.
pred = lb .transform (pred )
print ( type ( pred ), pred .shape )
<class 'numpy.ndarray'> (5668, 9)
# Wrap the one-hot array in a DataFrame; columns get default integer names.
pred = pd .DataFrame (pred )
<style>
.dataframe thead tr:only-child th {
text-align: right;
}
.dataframe thead th {
text-align: left;
}
.dataframe tbody tr th {
vertical-align: top;
}
</style>
0
1
2
3
4
5
6
7
8
5663
0
0
0
0
0
0
1
0
0
5664
0
0
0
0
0
0
1
0
0
5665
0
1
0
0
0
0
0
0
0
5666
0
0
0
0
0
0
1
0
0
5667
0
0
0
1
0
0
0
0
0
# Build the submission table from the per-class probabilities.
# The model is validated above with log loss, a probability-based metric; the
# one-hot hard labels put all mass on a single class and are penalised heavily
# on every wrong row. predict_proba columns follow the classifier's sorted
# class labels, so they map onto class1..class9 in order.
submit = pd.DataFrame(
    pred_pro,
    columns=['class1', 'class2', 'class3', 'class4', 'class5', 'class6', 'class7', 'class8', 'class9'],
)
# Row identifiers 0..n-1, in the same order as the predicted rows.
ID = pd.DataFrame({"ID": range(len(submit))})
<style>
.dataframe thead tr:only-child th {
text-align: right;
}
.dataframe thead th {
text-align: left;
}
.dataframe tbody tr th {
vertical-align: top;
}
</style>
class1
class2
class3
class4
class5
class6
class7
class8
class9
5663
0
0
0
0
0
0
1
0
0
5664
0
0
0
0
0
0
1
0
0
5665
0
1
0
0
0
0
0
0
0
5666
0
0
0
0
0
0
1
0
0
5667
0
0
0
1
0
0
0
0
0
# Prepend the ID column to the class columns (column-wise concat on the shared
# default row index) and preview the last rows.
submit = pd .concat ([ID , submit ], axis = 1 )
submit .tail ()
<style>
.dataframe thead tr:only-child th {
text-align: right;
}
.dataframe thead th {
text-align: left;
}
.dataframe tbody tr th {
vertical-align: top;
}
</style>
ID
class1
class2
class3
class4
class5
class6
class7
class8
class9
5663
5663
0
0
0
0
0
0
1
0
0
5664
5664
0
0
0
0
0
0
1
0
0
5665
5665
0
1
0
0
0
0
0
0
0
5666
5666
0
0
0
0
0
0
1
0
0
5667
5667
0
0
0
1
0
0
0
0
0
# Write the submission without the DataFrame index.
# NOTE(review): the file name says "xgbc" although the predictions come from
# the LightGBM model fitted above -- consider renaming to avoid confusion.
submit .to_csv ('data/submit_xgbc.csv' , index = False )