beyondacm/Kaggle_Personalized_Medicine

Solutions for the Kaggle: PMRCT

Personalized Medicine : Redefining Cancer Treatment
Predict the effect of Genetic Variants to enable Personalized Medicine

Loading Libraries

import numpy as np  # Linear Algebra
import pandas as pd # Data processing
from pandas import HDFStore
import matplotlib.pyplot as plt # Data Visualization
%matplotlib inline
import seaborn as sns # Visualization
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS, CountVectorizer
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.model_selection import cross_val_predict, StratifiedKFold, train_test_split, GridSearchCV
from sklearn.metrics import log_loss, accuracy_score
from collections import Counter
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import re
import nltk

Data Importing & Preprocessing

# The text files separate their two fields with "||". The separator is given
# as an escaped pattern together with engine="python"; raw strings are used so
# that "\|" is passed through literally instead of being parsed as an
# (invalid) string escape sequence.
# skiprows=1 drops the first line of each file (presumably the header) and the
# column names are supplied explicitly instead.
train_text_df = pd.read_csv('data/training_text', sep=r"\|\|", engine="python", skiprows=1, names=["ID", "Text"])
test_text_df  = pd.read_csv('data/test_text', sep=r"\|\|", engine="python", skiprows=1, names=["ID", "Text"])
# The variant tables are read with the default read_csv settings.
train_vari_df = pd.read_csv('data/training_variants')
test_vari_df  = pd.read_csv('data/test_variants')

train_text_df.head()

.dataframe thead th {
    text-align: left;
}

.dataframe tbody tr th {
    vertical-align: top;
}

</style>

	ID	Text
0	0	Cyclin-dependent kinases (CDKs) regulate a var...
1	1	Abstract Background Non-small cell lung canc...
2	2	Abstract Background Non-small cell lung canc...
3	3	Recent evidence has demonstrated that acquired...
4	4	Oncogenic mutations in the monomeric Casitas B...

train_vari_df.tail()

.dataframe thead th {
    text-align: left;
}

.dataframe tbody tr th {
    vertical-align: top;
}

</style>

	ID	Gene	Variation	Class
3316	3316	RUNX1	D171N	4
3317	3317	RUNX1	A122*	1
3318	3318	RUNX1	Fusions	1
3319	3319	RUNX1	R80C	4
3320	3320	RUNX1	K83E	4

# Attach the article text to each training variant row by matching on ID.
df = train_vari_df.join(train_text_df.set_index('ID'), on='ID')

# Features: the three descriptive columns. Target: Class as a NumPy array.
df_X = df.loc[:, ['Gene', 'Variation', 'Text']]
df_y = df['Class'].values

print( type(df_y) )
print( type(df_X) )

<class 'numpy.ndarray'>
<class 'pandas.core.frame.DataFrame'>

# Join the test variants with their text, then drop the first column by
# position so only the remaining columns are kept.
df_test = test_vari_df.join(test_text_df.set_index('ID'), on='ID').iloc[:, 1:]
# Stack the training features on top of the test rows with a fresh index.
df_new = pd.concat([df_X, df_test], ignore_index=True)

print( type(df_new), df_new.shape )

<class 'pandas.core.frame.DataFrame'> (8989, 3)

df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8989 entries, 0 to 8988
Data columns (total 3 columns):
Gene         8989 non-null object
Variation    8989 non-null object
Text         8989 non-null object
dtypes: object(3)
memory usage: 210.8+ KB

Extracting Features

TF-IDF Features

# TF-IDF vectorizer with default settings; it is fitted on the cleaned text
# further down.
tfidf_vect = TfidfVectorizer()
# scikit-learn's English stop-word collection, read by text_Decomposition.
stop_words = ENGLISH_STOP_WORDS

def text_Decomposition(text):
    """Normalise one raw article into a lower-cased, space-separated string.

    Steps, in order:
      1. every character other than letters, digits and ``^ , ! . / + - _ =``
         is replaced by a space;
      2. the text is lower-cased and split on whitespace;
      3. tokens found in the module-level ``stop_words`` are dropped;
      4. the remaining tokens are re-joined with single spaces and every
         ``.`` and ``,`` is turned into a space.

    Args:
        text: the raw document as a ``str``.

    Returns:
        The cleaned document as a ``str``.
    """
    # The hyphen is escaped on purpose: written as "\+-_" it forms the range
    # '+'..'_' inside the character class, which silently whitelists
    # ':', ';', '<', '>', '?', '@', '[', '\' and ']' as well.
    text = re.sub(r"[^a-zA-Z0-9^,!./+\-_=]", " ", text)
    tokens = text.lower().split()
    # Stop words are filtered before '.' and ',' are stripped, so a token that
    # still carries punctuation (e.g. "the.") is not treated as a stop word.
    kept = [tok for tok in tokens if tok not in stop_words]
    return " ".join(kept).replace(".", " ").replace(",", " ")

# Clean every document (training and test rows alike) before vectorising.
df_new['Text'] = df_new['Text'].apply(text_Decomposition)

df_new['Text']

0       cyclin-dependent kinases cdks regulate variety...
1       abstract background non-small cell lung cancer...
2       abstract background non-small cell lung cancer...
3       recent evidence demonstrated acquired uniparen...
4       oncogenic mutations monomeric casitas b-lineag...
5       oncogenic mutations monomeric casitas b-lineag...
6       oncogenic mutations monomeric casitas b-lineag...
7       cbl negative regulator activated receptor tyro...
8       abstract juvenile myelomonocytic leukemia jmml...
9       abstract juvenile myelomonocytic leukemia jmml...
10      oncogenic mutations monomeric casitas b-lineag...
11      noonan syndrome autosomal dominant congenital ...
12      noonan syndrome autosomal dominant congenital ...
13      noonan syndrome autosomal dominant congenital ...
14      oncogenic mutations monomeric casitas b-lineag...
15      noonan syndrome autosomal dominant congenital ...
16      determine residual cylindrical refractive erro...
17      acquired uniparental disomy aupd common featur...
18      oncogenic mutations monomeric casitas b-lineag...
19      acquired uniparental disomy aupd common featur...
20      abstract background non-small cell lung cancer...
21      oncogenic mutations monomeric casitas b-lineag...
22      oncogenic mutations monomeric casitas b-lineag...
23      recent evidence demonstrated acquired uniparen...
24      recent evidence demonstrated acquired uniparen...
25      recent evidence demonstrated acquired uniparen...
26      abstract n-myristoylation common form co-trans...
27      heterozygous mutations telomerase components t...
28      sequencing studies identified recurrent coding...
29      heterozygous mutations telomerase components t...
                              ...                        
8959    using dna microarray approach screen gene copy...
8960    tumor suppressor protein p53 inactivated mutat...
8961    mutational analysis oncogenes critical underst...
8962    serine/threonine protein kinase encoded akf pr...
8963    common participation oncogenic kras proteins l...
8964    ezh2 enhancer zeste homolog 2 critical enzymat...
8965    mutations metabolic enzymes isocitrate dehydro...
8966    pancreatic carcinomas acinar differentiation  ...
8967    -catenin-mediated signaling constitutively act...
8968    summary genetic abnormalities underlying hered...
8969    lung cancer leading cause cancer-related morta...
8970    summary past decade  treatment lung adenocarci...
8971    transcription factor tumor suppressor protein ...
8972    protein tyrosine phosphatase receptor type d p...
8973    sensitizing activating mutations tyrosine kina...
8974    structural rearrangements chromosome 10 freque...
8975    introduction production fertile gametes essent...
8976    checkpoint kinase 2 chek2  chk2 emerges import...
8977    introduction telomere sequences chromosomal en...
8978    estimated 1 million cases breast cancer bc dia...
8979    occurring responders expression table i359l 0 ...
8980    background aims: inherited deleterious mutatio...
8981    glioblastoma multiforme gbm lethal brain tumou...
8982    diffuse large b cell lymphoma dlbcl complex di...
8983    figure largedownload s818l clones change atpas...
8984    realization late 1970s ras harboured transform...
8985    hemizygous deletions common molecular abnormal...
8986    r267w smartpool investigate 533 experiments 5q...
8987    abstract blood samples 125 unrelated families ...
8988    loss dna mismatch repair mmr humans  mainly mu...
Name: Text, Length: 8989, dtype: object

# Fit the TF-IDF vectorizer on the cleaned corpus and encode it in one step.
tfidf_features = tfidf_vect.fit_transform(df_new['Text'])

print( type(tfidf_features), tfidf_features.shape )

<class 'scipy.sparse.csr.csr_matrix'> (8989, 167304)

# Reduce the sparse TF-IDF matrix to 500 dense components; random_state=0
# fixes the seed so the decomposition is repeatable.
svd = TruncatedSVD(n_components=500, n_iter=5, random_state=0)

truncated_tfidf = svd.fit_transform(tfidf_features)

print( type( truncated_tfidf ), truncated_tfidf.shape )

<class 'numpy.ndarray'> (8989, 500)

# Label the SVD components tfidf_0, tfidf_1, ... The number of labels is read
# from the array itself so it always matches the number of components
# produced by the decomposition.
df_tfidf_col_name = ["tfidf_"+str(i) for i in range(truncated_tfidf.shape[1])]

df_tfidf = pd.DataFrame(truncated_tfidf, columns=df_tfidf_col_name)
df_tfidf.tail()

.dataframe thead th {
    text-align: left;
}

.dataframe tbody tr th {
    vertical-align: top;
}

</style>

	tfidf_0	tfidf_1	tfidf_2	tfidf_3	tfidf_4	tfidf_5	tfidf_6	tfidf_7	tfidf_8	tfidf_9	...	tfidf_490	tfidf_491	tfidf_492	tfidf_493	tfidf_494	tfidf_495	tfidf_496	tfidf_497	tfidf_498	tfidf_499
8984	0.198822	-0.050257	0.012171	-0.089914	-0.004059	-0.027756	0.009028	-0.050974	0.080609	0.033432	...	-0.017983	-0.005827	0.009936	-0.000408	0.009475	0.011130	0.003676	0.011604	-0.019133	-0.003256
8985	0.168936	-0.042271	-0.007801	-0.062719	0.020268	-0.023329	-0.006039	-0.027311	0.033846	-0.007164	...	-0.013675	0.004163	-0.009727	-0.005671	-0.007209	0.031979	0.007391	0.003575	0.033547	0.010747
8986	0.242678	-0.099755	-0.108291	0.137711	0.065636	0.127120	-0.129976	0.030754	-0.017880	-0.014541	...	-0.017030	0.006821	0.006546	-0.003729	0.009402	0.022159	-0.002369	-0.004654	-0.022835	-0.002433
8987	0.163984	-0.025351	0.004394	-0.007817	0.021132	-0.008803	-0.003844	-0.003872	-0.000813	-0.000259	...	-0.008726	-0.007982	-0.004096	-0.004657	-0.005381	0.010965	0.006184	0.001998	-0.004294	-0.003493
8988	0.169593	0.042018	0.020699	-0.015233	0.007866	-0.033753	-0.021334	0.048231	-0.030702	-0.012852	...	-0.000946	0.009195	0.008646	-0.001500	0.003134	-0.014157	0.007950	-0.001908	0.012477	-0.014073

5 rows × 500 columns

Bag of words Features

# Plain term counts over single words (unigrams only).
bow_vectorizer = CountVectorizer(ngram_range=(1, 1), min_df=1)
bow_features = bow_vectorizer.fit_transform(df_new['Text'])

print(type(bow_features), bow_features.shape)

<class 'scipy.sparse.csr.csr_matrix'> (8989, 167304)

# Same reduction as for the TF-IDF matrix: 500 components, fixed seed.
svd_bow = TruncatedSVD(n_components=500, n_iter=5, random_state=0)
truncated_bow = svd_bow.fit_transform( bow_features )

print( type( truncated_bow ), truncated_bow.shape )

<class 'numpy.ndarray'> (8989, 500)

# Label the SVD components bow_0, bow_1, ... The number of labels is read
# from the array itself so it always matches the number of components
# produced by the decomposition.
df_bow_col_name = ["bow_"+str(i) for i in range(truncated_bow.shape[1])]
df_bow = pd.DataFrame(truncated_bow, columns=df_bow_col_name)
df_bow.tail()

.dataframe thead th {
    text-align: left;
}

.dataframe tbody tr th {
    vertical-align: top;
}

</style>

	bow_0	bow_1	bow_2	bow_3	bow_4	bow_5	bow_6	bow_7	bow_8	bow_9	...	bow_490	bow_491	bow_492	bow_493	bow_494	bow_495	bow_496	bow_497	bow_498	bow_499
8984	194.526113	-47.991367	-23.033461	-44.469582	12.314702	112.890376	-6.968366	-39.409591	35.309063	48.143541	...	6.041960	-1.352358	7.512079	0.280627	-1.234230	-2.138932	2.974048	2.003135	-2.488450	-3.096574
8985	69.715744	-15.101818	-9.751791	6.825362	4.400990	8.165505	-8.844331	-5.926672	14.725673	15.641581	...	1.858497	2.571454	0.303478	-3.180141	0.454055	-1.137666	0.840475	0.636456	1.383821	-0.293513
8986	57.750708	48.205872	-13.017993	-8.674400	20.067003	-23.089222	-9.090631	-15.952598	-1.722238	18.967338	...	2.248561	0.538540	1.048798	0.916919	-1.144835	0.955626	-0.159801	1.170221	2.212243	-1.639418
8987	210.185671	7.402431	9.203735	-33.907666	-30.323195	-12.265160	-18.975167	-24.648927	48.958101	-0.342035	...	-5.108713	5.970898	2.408793	2.173039	1.585976	11.776157	-3.472449	4.134649	2.737938	-5.758787
8988	69.524366	-2.637226	34.608404	-0.423025	-0.272547	-2.477183	-1.825543	2.157657	16.846462	5.286870	...	0.779397	-0.458060	-0.016754	-2.339020	-1.451003	2.258365	4.500253	0.275865	0.884756	0.543471

5 rows × 500 columns

Dummy Features

# Gene and Variation are the first two columns of df_new; they are the
# categorical inputs for the one-hot encoding.
df_dummy = df_new[['Gene', 'Variation']]

df_dummy

.dataframe thead th {
    text-align: left;
}

.dataframe tbody tr th {
    vertical-align: top;
}

</style>

	Gene	Variation
0	FAM58A	Truncating Mutations
1	CBL	W802*
2	CBL	Q249E
3	CBL	N454D
4	CBL	L399V
5	CBL	V391I
6	CBL	V430M
7	CBL	Deletion
8	CBL	Y371H
9	CBL	C384R
10	CBL	P395A
11	CBL	K382E
12	CBL	R420Q
13	CBL	C381A
14	CBL	P428L
15	CBL	D390Y
16	CBL	Truncating Mutations
17	CBL	Q367P
18	CBL	M374V
19	CBL	Y371S
20	CBL	H94Y
21	CBL	C396R
22	CBL	G375P
23	CBL	S376F
24	CBL	P417A
25	CBL	H398Y
26	SHOC2	S2G
27	TERT	Y846C
28	TERT	C228T
29	TERT	H412Y
...	...	...
8959	VSX1	R166W
8960	MTM1	E157K
8961	D2HGDH	V444A
8962	DMD	Y231N
8963	ANKH	G389R
8964	DCX	R59L
8965	ADSL	R190Q
8966	HSD17B3	A56T
8967	CD96	T280M
8968	HPS3	R397W
8969	FKTN	R179T
8970	DARS2	L613F
8971	TP53	G245C
8972	ALG12	T67M
8973	ACOX1	Q309R
8974	CLDN19	Q57E
8975	MLH3	N499S
8976	GJB1	F235C
8977	LRP5	G171V
8978	TGFBI	R124S
8979	CYP2C9	I359L
8980	HSD17B3	M235V
8981	CAV3	A46V
8982	ABHD5	E260K
8983	NR3C2	S818L
8984	SLC46A1	R113S
8985	FOXC1	L130F
8986	GSS	R267W
8987	CTSK	G79E
8988	DFNB59	T54I

8989 rows × 2 columns

# One-hot encode both categorical columns; the resulting column names are
# prefixed with the source column (Gene_..., Variation_...).
df_dummy = pd.get_dummies(df_dummy)

df_dummy.tail()

.dataframe thead th {
    text-align: left;
}

.dataframe tbody tr th {
    vertical-align: top;
}

</style>

	Gene_A4GALT	Gene_AAAS	Gene_AANAT	Gene_AARS	Gene_ABCA1	Gene_ABCA12	Gene_ABCA3	Gene_ABCA4	Gene_ABCB11	Gene_ABCB7	...	Variation_null380R	Variation_null399R	Variation_null420W	Variation_null423L	Variation_null462G	Variation_null483L	Variation_null496R	Variation_null522S	Variation_null654G	Variation_p61BRAF
8984	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
8985	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
8986	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
8987	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
8988	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0

5 rows × 10116 columns

Combine TF-IDF Features + Bow Features + Dummy Features

# The three feature tables describe the same rows in the same order, so they
# are placed side by side by position. Resetting each index makes the
# alignment purely positional and avoids adding a helper key column to
# df_bow, df_tfidf and df_dummy.
# Column order: bow_*, then tfidf_*, then the dummy columns.
df_new = pd.concat(
    [part.reset_index(drop=True) for part in (df_bow, df_tfidf, df_dummy)],
    axis=1,
)

print( type(df_new), df_new.shape )

<class 'pandas.core.frame.DataFrame'> (8989, 11116)

df_new.tail()

.dataframe thead th {
    text-align: left;
}

.dataframe tbody tr th {
    vertical-align: top;
}

</style>

	...	bow_490	bow_491	bow_492	bow_493	bow_494	bow_495	bow_496	bow_497	bow_498	bow_499
8984	...	6.041960	-1.352358	7.512079	0.280627	-1.234230	-2.138932	2.974048	2.003135	-2.488450	-3.096574
8985	...	1.858497	2.571454	0.303478	-3.180141	0.454055	-1.137666	0.840475	0.636456	1.383821	-0.293513
8986	...	2.248561	0.538540	1.048798	0.916919	-1.144835	0.955626	-0.159801	1.170221	2.212243	-1.639418
8987	...	-5.108713	5.970898	2.408793	2.173039	1.585976	11.776157	-3.472449	4.134649	2.737938	-5.758787
8988	...	0.779397	-0.458060	-0.016754	-2.339020	-1.451003	2.258365	4.500253	0.275865	0.884756	0.543471

5 rows × 11116 columns

# The labelled training rows were stacked first, so the first len(df_y) rows
# of the combined frame are the training features and the rest is the test
# set. Using the label count avoids hard-coding the training-set size.
df_X     = df_new.iloc[:len(df_y),  :]
df_test  = df_new.iloc[len(df_y):,  :]

df_X.tail()

.dataframe thead th {
    text-align: left;
}

.dataframe tbody tr th {
    vertical-align: top;
}

</style>

	...	bow_490	bow_491	bow_492	bow_493	bow_494	bow_495	bow_496	bow_497	bow_498	bow_499
3316	...	1.745059	-5.821573	3.241390	-4.601446	-0.799721	0.964183	-6.654517	-7.173837	-0.191026	-3.266606
3317	...	2.279242	-2.976092	1.965532	-8.854997	3.721709	2.073133	-9.029381	-10.325801	-7.744560	-3.181727
3318	...	-2.660800	-1.165723	6.478892	0.090645	5.618230	-3.316006	-4.345822	-6.997560	1.835467	-5.563675
3319	...	-3.003526	2.542012	-2.760956	0.071038	-1.042046	-1.497018	2.630176	0.224616	6.731985	-0.477152
3320	...	-2.288296	3.421923	-2.794622	-1.133969	-4.935845	-4.073437	4.769763	1.425812	9.841368	-6.163921

5 rows × 11116 columns

# Hold out part of the labelled rows for local validation; random_state=0
# makes the split repeatable.
train_X, test_X, train_y, test_y = train_test_split(df_X, df_y, random_state=0)

Model Building

# Cross Validation
def model_cv(train, test, train_y, test_y, model, name):
    """Fit a search wrapper and print its hold-out metrics.

    Args:
        train: feature rows used for fitting.
        test: held-out feature rows.
        train_y: labels for ``train``.
        test_y: labels for ``test``.
        model: object exposing ``fit``, ``best_params_``, ``predict_proba``
            and ``score`` (the callers in this file pass a ``GridSearchCV``).
        name: label printed in front of the selected parameters.

    Returns:
        None. The results are only printed.
    """
    model.fit(train, train_y)
    print(name, ': ', model.best_params_)

    # Class probabilities on the held-out rows feed the log-loss figure.
    probabilities = model.predict_proba(test)

    train_score = model.score(train, train_y)
    print('train score: {}'.format(train_score))
    test_score = model.score(test, test_y)
    print('test score: {}'.format(test_score))

    holdout_loss = log_loss(test_y, probabilities)
    print('log loss: {}'.format(holdout_loss))
    print()

# Models
def forest(train, test, train_y, test_y):
    """Grid-search a random forest and report its scores via ``model_cv``.

    The grid holds a single candidate: 500 trees with ``sqrt`` feature
    sampling.

    Args:
        train: feature rows used for fitting.
        test: held-out feature rows.
        train_y: labels for ``train``.
        test_y: labels for ``test``.

    Returns:
        Whatever ``model_cv`` returns.
    """
    param = [{'n_estimators': [500],
              'max_features': ['sqrt'],
              }]
    # Three unshuffled stratified folds. random_state is deliberately not
    # passed: it only has an effect together with shuffle=True, and current
    # scikit-learn raises a ValueError when it is set while shuffle is False.
    # n_splits is spelled out because the library default moved from 3 to 5.
    folds = StratifiedKFold(n_splits=3)
    model = GridSearchCV(RandomForestClassifier(n_jobs=-1, random_state=0), param, cv=folds)
    name = 'Random forest'
    return model_cv(train, test, train_y, test_y, model, name)

def xgbc(train, test, train_y, test_y):
    """Grid-search an XGBoost classifier and report its scores via ``model_cv``.

    The grid holds a single candidate: 300 estimators at learning rate 0.05.

    Args:
        train: feature rows used for fitting.
        test: held-out feature rows.
        train_y: labels for ``train``.
        test_y: labels for ``test``.

    Returns:
        Whatever ``model_cv`` returns.
    """
    # NOTE(review): the class labels used in this file start at 1; some
    # xgboost releases expect labels starting at 0 - confirm against the
    # installed version before running.
    param = [{'n_estimators': [300],
              'learning_rate': [0.05],
              }]
    # Three unshuffled stratified folds. random_state is deliberately not
    # passed: it only has an effect together with shuffle=True, and current
    # scikit-learn raises a ValueError when it is set while shuffle is False.
    # n_splits is spelled out because the library default moved from 3 to 5.
    folds = StratifiedKFold(n_splits=3)
    model = GridSearchCV(XGBClassifier(), param, cv=folds)
    name = 'XGBoost'
    return model_cv(train, test, train_y, test_y, model, name)

def lgbm(train, test, train_y, test_y):
    """Grid-search a LightGBM classifier and report its scores via ``model_cv``.

    The grid holds a single candidate (100 estimators, learning rate 0.05)
    and is evaluated with ``cv=3``.

    Args:
        train: feature rows used for fitting.
        test: held-out feature rows.
        train_y: labels for ``train``.
        test_y: labels for ``test``.

    Returns:
        Whatever ``model_cv`` returns.
    """
    search_space = [{'n_estimators': [100], 'learning_rate': [0.05]}]
    searcher = GridSearchCV(LGBMClassifier(), search_space, cv=3)
    return model_cv(train, test, train_y, test_y, searcher, 'LightGBM')

forest(train_X, test_X, train_y, test_y)

Random forest :  {'max_features': 'sqrt', 'n_estimators': 500}
train score: 1.0
test score: 0.6293622141997594
log loss: 1.4546850488576162

lgbm(train_X, test_X, train_y, test_y)

LightGBM :  {'learning_rate': 0.05, 'n_estimators': 100}
train score: 0.9080321285140562
test score: 0.6702767749699158
log loss: 0.9912064764789812

# xgbc(train_X, test_X, train_y, test_y)

# LightGBM is the model selected for this task.
# NOTE(review): this assignment rebinds the name `lgbm`, so the helper
# function lgbm() defined earlier can no longer be called after this line.
lgbm = LGBMClassifier(learning_rate=0.05, n_estimators=100)

# The model is fitted on the training part of the hold-out split only.
lgbm.fit(train_X, train_y)

LGBMClassifier(boosting_type='gbdt', colsample_bytree=1, learning_rate=0.05,
        max_bin=255, max_depth=-1, min_child_samples=10,
        min_child_weight=5, min_split_gain=0, n_estimators=100, nthread=-1,
        num_leaves=31, objective='multiclass', reg_alpha=0, reg_lambda=0,
        seed=0, silent=True, subsample=1, subsample_for_bin=50000,
        subsample_freq=1)

Submit

# Hard class labels for the test rows.
pred = lgbm.predict(df_test)
# Per-class probabilities for the same rows.
# NOTE(review): in the visible code pred_pro is only used for the shape
# print below; the submission is built from the hard labels in `pred`.
pred_pro = lgbm.predict_proba(df_test)

print( type( pred ), pred.shape, pred_pro.shape )

<class 'numpy.ndarray'> (5668,) (5668, 9)

pred

array([7, 4, 7, ..., 2, 7, 4])

from sklearn import preprocessing

# One-hot encoder for the class labels. It is fitted on the training labels
# rather than on the predictions, so every known class gets its own column
# even when the model never predicts it; the submission header set further
# down expects one column per class.
lb = preprocessing.LabelBinarizer()
lb.fit(df_y)

LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)

lb.classes_

array([1, 2, 3, 4, 5, 6, 7, 8, 9])

# Replace each predicted label with its one-hot row.
pred = lb.transform(pred)

print( type( pred ), pred.shape )

<class 'numpy.ndarray'> (5668, 9)

# Wrap the one-hot array in a DataFrame (default integer column labels).
pred = pd.DataFrame(pred)

pred.tail()

.dataframe thead th {
    text-align: left;
}

.dataframe tbody tr th {
    vertical-align: top;
}

</style>

	1	3	6
5663	0	0	1
5664	0	0	1
5665	1	0	0
5666	0	0	1
5667	0	1	0

submit = pd.DataFrame(pred)

len(submit)

# submit.tail()
# One running ID per submission row, counting up from 0.
ID = pd.DataFrame([{"ID": row_id} for row_id in range(len(submit))])
# Header for the nine class columns: class1 ... class9.
submit.columns = ['class{}'.format(k) for k in range(1, 10)]

submit.tail()

.dataframe thead th {
    text-align: left;
}

.dataframe tbody tr th {
    vertical-align: top;
}

</style>

	class2	class4	class7
5663	0	0	1
5664	0	0	1
5665	1	0	0
5666	0	0	1
5667	0	1	0

# ID
# Put the ID column in front of the class columns (column-wise concat).
submit = pd.concat([ID, submit], axis=1)
submit.tail()

.dataframe thead th {
    text-align: left;
}

.dataframe tbody tr th {
    vertical-align: top;
}

</style>

	ID	class2	class4	class7
5663	5663	0	0	1
5664	5664	0	0	1
5665	5665	1	0	0
5666	5666	0	0	1
5667	5667	0	1	0

# NOTE(review): the file name says "xgbc", but in the visible code these
# predictions come from the LightGBM model - confirm the name is intended.
# index=False keeps the DataFrame index out of the CSV.
submit.to_csv('data/submit_xgbc.csv', index=False)

beyondacm / Kaggle_Personalized_Medicine

Solutions for the Kaggle: PMRCT

Loading Libraries

Data Importing & Preprocessing

Extracting Features

TF-IDF Features

Bag of words Features

Dummy Features

Combine TF-IDF Features + Bow Features + Dummy Features

Model Building

Submit

About

Languages