
Commit 7c4cbac5 authored by Vítor Vieira

[ADD] Prediction code and first submission

parent 416b7733
from src.get_molecules import *
import re
from src import evaluation_metrics
from src.data_preprocessing import *
import pandas as pd
import numpy as np
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_selection import SelectKBest, VarianceThreshold
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer
from sklearn.compose import ColumnTransformer
from inspect import getmembers, isfunction
test_template_path = 'dream/round_1_template.csv'
template_path = 'dream/round_1_template.csv' # done
dom_dataframe_path = 'generated/domain_sequences.csv' # done
interactions_path = 'generated/training_df.csv' # done
acc_to_identifiers_path = 'generated/accession_to_identifiers_df.csv' # done
uniprot_json_path = 'generated/uniprot_df.json' # nvm
domseq_to_uniprot_path = 'generated/domain_sequence_to_uniprot_mapping.csv' # nvm
multiple_alignf_path = 'generated/multiple_align_features.csv'
protein_features_path = 'generated/protein_features.csv'
protein_data_path = 'generated/TRAIN_protein_data.csv'
intrx_compound_path = 'generated/TRAIN_intrx_compound_data.csv'
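# Load the ChEMBL 24.1 chemical representations table (TSV) and index it by a
# chosen identifier (the standard InChI key by default) so compounds can be looked up quickly.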
def get_chemical_mapping_df(chembl_map_path='data/chembl_24_1_chemreps.txt.gz', index_by='standard_inchi_key'):
return pd.read_csv(chembl_map_path, sep='\t').set_index(index_by, drop=True)
@@ -126,4 +157,133 @@ def generate_data(acc_to_identifiers_path, multiple_alignf_path, domseq_to_unipr
final_protein_df.to_csv('generated/TRAIN_protein_data.csv')
interaction_data.to_csv('generated/TRAIN_intrx_data.csv')
compound_df.to_csv('generated/TRAIN_compound_df.csv')
    return interaction_data, compound_df, final_protein_df
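# Build the training feature tables. The heavier protein descriptors listed in
# protein_features_to_drop are excluded, presumably to keep memory usage manageable
# (see the MemoryError TODO below).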
def load_features():
## TODO: Deal with MemoryError
#final_protein_df = pd.read_csv(protein_data_path, index_col = 0)
protein_features_to_drop = [
'APAAC',
'Moran',
'Geary',
'Tripeptide'
]
interaction_data, compound_df, final_protein_df = generate_data(
acc_to_identifiers_path,
multiple_alignf_path,
domseq_to_uniprot_path,
interactions_path,
protein_features_to_drop,
protein_features_path,
mol_descriptors=['Morgan','MACCS','MolLogP'],
write=True)
return interaction_data, compound_df, final_protein_df
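# Assemble features for the submission template: compound descriptors are computed
# from SMILES, protein features are joined on the UniProt accession.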
def load_test_data():
    ## TODO: Needs to be implemented properly
    template = pd.read_csv(template_path)
    test_compound_df = get_compound_feature_dataframe(template, ['Morgan','MACCS','MolLogP'], 'Compound_SMILES')
    # assumes the training protein feature table covers the template targets
    final_protein_df = pd.read_csv(protein_data_path, index_col=0)
    test_protein_df = final_protein_df.loc[template.UniProt_Id]
    template_w_compounds = template.set_index('Compound_SMILES').join(test_compound_df).reset_index()
    template_w_all = template_w_compounds.set_index('UniProt_Id').join(test_protein_df).reset_index()
    return template_w_all
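# Wrap every function in src.evaluation_metrics as a scikit-learn scorer. RMSE is a
# loss, so its sign is flipped via greater_is_better=False.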
def make_scorers():
functions_list = getmembers(evaluation_metrics, isfunction)
    scorer_list = []
    for metric_name, func in functions_list:
        greater_is_better = metric_name != 'rmse'
        scorer_list.append((metric_name, make_scorer(func, greater_is_better=greater_is_better)))
    return dict(scorer_list)
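# Load features, drop constant protein columns, apply a variance threshold after
# imputation, and join interactions with compound and protein features into one frame.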
def load_training_data():
interaction_data, compound_df, final_protein_df = load_features()
different_features = final_protein_df.apply(lambda x: len(x.value_counts()) > 1)
final_protein_df_1 = final_protein_df.loc[:,different_features]
    p = 0.01
    vt = VarianceThreshold(threshold=p)
ppl = Pipeline(steps=[('impute',SimpleImputer(strategy='most_frequent')), ('vt',vt)])
ppl.fit(final_protein_df_1)
final_protein_df_2 = final_protein_df_1.loc[:,ppl.steps[-1][1].get_support()]
df_w_compounds = interaction_data.set_index('canonical_smiles').join(compound_df).reset_index()
df_w_all = df_w_compounds.set_index('target_id').join(final_protein_df_2).reset_index()
## TODO: Remove compound and protein indexes (smiles/uniprot)
return df_w_all
#
# template = pd.read_csv(template_path)
# interaction_data = add_chemical_identifier_to_df(preprocess_interaction_data(interactions_path))
#
# sub_int_data = interaction_data[interaction_data.target_id.isin(template.UniProt_Id)]
# sub_int_data.target_id.value_counts()
# comp_df = get_compound_feature_dataframe(template, ['Morgan','MACCS','MolLogP'], 'Compound_SMILES')
#
# (template.UniProt_Id == 'P00519').sum()
# test_data = load_test_data()
#
# X = X[X['level_0'].isin(template.UniProt_Id)]
#
# len((X['level_0'] + X['index']).unique())
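# Split the joined frame into X/y, keeping only rows whose target is in
# uniprot_filter, and record the column positions of numeric and binary features
# for downstream preprocessing.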
def get_training_df(df, uniprot_filter):
df_w_all = (df[df['level_0'].isin(uniprot_filter)]).drop(columns=['level_0', 'index', 'standard_inchi_key'])
X_train, y_train = df_w_all.drop(columns=['standard_value']), df_w_all['standard_value']
numeric_features, binary_features = [], []
    for i, col in enumerate(X_train.columns):
        if df_w_all[col].dtype == np.float64:
            numeric_features.append(i)
        elif df_w_all[col].dtype == np.int64 and len(df_w_all[col].value_counts().index) == 2:
            binary_features.append(i)
return X_train, y_train, numeric_features, binary_features
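# Build the training matrices restricted to the round 1 template targets and
# persist them, together with the feature-type indices, for the modelling script.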
if __name__ == '__main__':
X = load_training_data()
# X_train, y_train, numeric_features, binary_features = get_training_df(X)
#
# X_train.to_csv('generated/final_X_train.csv')
# y_train.to_csv('generated/final_y_train.csv')
#
# with open('generated/final_x_numerical_features.pkl', 'wb') as f:
# pickle.dump(numeric_features, f)
#
# with open('generated/final_x_binary_features.pkl', 'wb') as f:
# pickle.dump(binary_features, f)
template = pd.read_csv('dream/round_1_template.csv')
X_train, y_train, numeric_features, binary_features = get_training_df(X, template.UniProt_Id)
X_train.to_csv('generated/final_X_train_subset.csv')
y_train.to_csv('generated/final_y_train_subset.csv')
import pickle
with open('generated/final_x_numerical_features_subset.pkl', 'wb') as f:
pickle.dump(numeric_features, f)
with open('generated/final_x_binary_features_subset.pkl', 'wb') as f:
pickle.dump(binary_features, f)
@@ -105,7 +105,7 @@ def f1(y, f):
     """
     y_binary = copy.deepcopy(y)
-    y_binary = preprocessing.binarize(y_binary.reshape(1, abs-1),
+    y_binary = preprocessing.binarize(y_binary.reshape(1, -1),
                                       threshold=7.0,
                                       copy=False)[0]
     f_binary = copy.deepcopy(f)
@@ -43,7 +43,8 @@ def load_features():
         interactions_path,
         protein_features_to_drop,
         protein_features_path,
-        mol_descriptors=['Morgan','MACCS','MolLogP'])
+        mol_descriptors=['Morgan','MACCS','MolLogP'],
+        write=True)
     return interaction_data, compound_df, final_protein_df
@@ -100,7 +101,6 @@ def load_training_data():
 # sub_int_data.target_id.value_counts()
 # comp_df = get_compound_feature_dataframe(template, ['Morgan','MACCS','MolLogP'], 'Compound_SMILES')
 #
-X = load_training_data()
 # (template.UniProt_Id == 'P00519').sum()
 # test_data = load_test_data()
 #
@@ -108,8 +108,9 @@ X = load_training_data()
 #
 # len((X['level_0'] + X['index']).unique())
-def get_training_df(df):
-    df_w_all = df.drop(columns=['level_0', 'index', 'standard_inchi_key'])
+def get_training_df(df, uniprot_filter):
+    df_w_all = (df[df['level_0'].isin(uniprot_filter)]).drop(columns=['level_0', 'index', 'standard_inchi_key'])
     X_train, y_train = df_w_all.drop(columns=['standard_value']), df_w_all['standard_value']
@@ -122,18 +123,34 @@ def get_training_df(df):
     elif df_w_all[col].dtype == np.int and len(df_w_all[col].value_counts().index) == 2:
         binary_features.append(i)
     return X_train, y_train, numeric_features, binary_features
-X_train, y_train, numeric_features, binary_features = get_training_df(X)
 X = load_training_data()
+# X_train, y_train, numeric_features, binary_features = get_training_df(X)
+#
+# X_train.to_csv('generated/final_X_train.csv')
+# y_train.to_csv('generated/final_y_train.csv')
+#
+# with open('generated/final_x_numerical_features.pkl', 'wb') as f:
+#     pickle.dump(numeric_features, f)
+#
+# with open('generated/final_x_binary_features.pkl', 'wb') as f:
+#     pickle.dump(binary_features, f)
+template = pd.read_csv('dream/round_1_template.csv')
-X_train.to_csv('generated/final_X_train.csv')
+X_train, y_train, numeric_features, binary_features = get_training_df(X, template.UniProt_Id)
-y_train.to_csv('generated/final_y_train.csv')
+X_train.to_csv('generated/final_X_train_subset.csv')
+y_train.to_csv('generated/final_y_train_subset.csv')
-with open('generated/final_x_numerical_features.pkl', 'wb') as f:
+import pickle
+with open('generated/final_x_numerical_features_subset.pkl', 'wb') as f:
     pickle.dump(numeric_features, f)
-with open('generated/final_x_binary_features.pkl', 'wb') as f:
+with open('generated/final_x_binary_features_subset.pkl', 'wb') as f:
     pickle.dump(binary_features, f)
from src import evaluation_metrics
from src.data_preprocessing import *
import pandas as pd
import numpy as np
import re
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_selection import SelectKBest, VarianceThreshold
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer
from sklearn.ensemble import BaggingRegressor
from sklearn.compose import ColumnTransformer
from inspect import getmembers, isfunction
import pickle
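# Small helper to load a pickled object from disk.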
def read_pickle(fpath):
with open(fpath, 'rb') as f:
obj = pickle.load(f)
return obj
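# Load the training matrices written by the data-preparation script. standard_value
# is an affinity in nM, so -log10(value / 1e9) converts it to pKd, the -log10 molar
# scale used by the challenge.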
X_train = pd.read_csv('generated/final_X_train_subset.csv', index_col=0)
cols_to_use = X_train.columns
X_train = X_train.values
y_train = pd.read_csv('generated/final_y_train_subset.csv', header=None, index_col=0)
y_train_final = -np.log10(y_train/(10**9)).values.ravel()
def make_scorers():
functions_list = getmembers(evaluation_metrics, isfunction)
scorer_list = []
for metric_name, func in functions_list:
if metric_name == 'rmse':
scorer_list.append((metric_name, make_scorer(func, greater_is_better=False)))
else:
scorer_list.append((metric_name, make_scorer(func, greater_is_better=True)))
return dict(scorer_list)
numeric_features, binary_features = read_pickle('generated/final_x_numerical_features_subset.pkl'), read_pickle('generated/final_x_binary_features_subset.pkl')
numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler())])
# categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')), ('onehot', OneHotEncoder())])
binary_transformer = SimpleImputer(strategy='most_frequent')
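# Per-column preprocessing: median-impute and scale numeric columns, mode-impute
# binary ones. Note this ColumnTransformer is defined but not used in the pipeline
# below, which only imputes and selects features.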
preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, numeric_features),
# ('cat', categorical_transformer, categorical_features),
('bin', binary_transformer, binary_features)])
selector = SelectKBest(k=500)  # k matches the '500ftrs' artifacts below; NB: SelectKBest defaults to f_classif, f_regression may suit a continuous target better
estimator = SVR()  # placeholder; replaced by a bagged SVR below
pipe = Pipeline(steps=[('impute', SimpleImputer(strategy='mean')),  # SelectKBest cannot handle NaNs, so impute first
                       ('feature_selector', selector)])
#                      ('estimator', estimator)])
X_train_select = pipe.fit_transform(X_train, y_train_final)
estimator = BaggingRegressor(SVR(), max_samples=0.4, max_features=1)  # max_features=1 gives each SVR a single feature; 1.0 would use all features
scorer_list = make_scorers()
cv_results = cross_validate(estimator, X_train_select, y_train_final, cv=5, scoring=scorer_list, verbose=4, n_jobs=5)
for res in cv_results:
print(res+':\n')
print(cv_results[res])
estimator.fit(X_train_select, y_train_final)  # fit on the selected features so the saved model matches the transformed test matrix below
with open('dream/models/SVR_C1_500ftrs.pkl','wb') as f:
pickle.dump(estimator, f)
with open('dream/models/500ftrs_pipeline.pkl','wb') as f:
pickle.dump(pipe, f)
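# Map each (Compound_SMILES, UniProt_Id) pair to the row indices where it occurs in
# the template, so duplicate pairs can share one averaged prediction.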
def interaction_to_index_map(df):
d = {}
for index, row in zip(df.index, df[['Compound_SMILES', 'UniProt_Id']].values):
if tuple(row) not in d:
d[tuple(row)] = [index]
else:
d[tuple(row)].append(index)
return d
#estimator.fit(X_train_select, y_train_final)
## predictions
estimator = read_pickle('dream/models/SVR_C1_500ftrs.pkl')
selector = read_pickle('dream/models/500ftrs_pipeline.pkl')
template = pd.read_csv('dream/round_1_template.csv')  # round 1 submission template, same file used by the training script
template_compound_df = get_compound_feature_dataframe(template, ['Morgan','MACCS','MolLogP'], 'Compound_SMILES')
template_protein_df = pd.read_csv('generated/TRAIN_protein_data.csv', index_col=0)
template_with_comp = template[['Compound_SMILES','UniProt_Id']].set_index('Compound_SMILES',drop=False).join(template_compound_df)
template_full = template_with_comp.set_index('UniProt_Id', drop=False).join(template_protein_df).reset_index(drop=True)
index_map = interaction_to_index_map(template_full)
template_full = template_full.set_index(['Compound_SMILES','UniProt_Id'])[cols_to_use]
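# Refit the impute + select-500 pipeline on the training matrix so the template
# features are projected onto the same 500 selected columns before prediction.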
pipe = Pipeline(steps = [('impute',SimpleImputer(strategy='mean')),
('select',SelectKBest(k=500))])
X_train_select = pipe.fit_transform(X_train, y_train_final)
X_test_select = pipe.transform(template_full.values)
preds = estimator.predict(X_test_select)
y_test = []
for i in range(template.shape[0]):
comp = template['Compound_SMILES'].iloc[i]
prot = template['UniProt_Id'].iloc[i]
pred_index = index_map[(comp, prot)]
    # average the predictions over duplicate (compound, protein) rows in the template
    pred = preds[pred_index].mean()
y_test.append(pred)
prediction_to_submit = template.copy()
prediction_to_submit['pKd_[M]_pred'] = y_test
prediction_to_submit.to_csv('dream/round1_t1/SVR_C1_500ftrs.csv', index=None)