Git Lab CI for docker build enabled! You can enable it using .gitlab-ci.yml in your project. Check file template at

Commit ba102a71 authored by Delora Baptista's avatar Delora Baptista
Browse files

[ADD] code for deep learning models (drug graph conv + prot Conv1D model)

parent 1532c936
from __future__ import division, print_function, absolute_import # original code was written in Python 2.x
import ast
from keras.models import Model
from keras.layers import Input, Dense, Dropout, BatchNormalization, concatenate, add, Conv1D, GlobalMaxPooling1D, Embedding
from keras.optimizers import Adam, SGD, RMSprop
from keras.regularizers import l1_l2
from NGF.layers import NeuralGraphHidden, NeuralGraphOutput, NeuralGraphPool, AtomwiseDropout
from NGF.utils import zip_mixed, is_iterable
from keras.wrappers.scikit_learn import BaseWrapper
import numpy as np
import copy
from keras.utils.np_utils import to_categorical
from keras.utils.generic_utils import has_arg
class DrugProtConvModel(BaseWrapper):
def check_params(self, params):
"""Checks for user typos in `params`.
# Arguments
params: dictionary; the parameters to be checked
# Raises
ValueError: if any member of `params` is not a valid argument.
legal_params_fns = [, Model.predict, Model.evaluate]
# if self.build_fn is None:
# legal_params_fns.append(self.__call__)
# elif (not isinstance(self.build_fn, types.FunctionType) and
# not isinstance(self.build_fn, types.MethodType)):
# legal_params_fns.append(self.build_fn.__call__)
# else:
# legal_params_fns.append(self.build_fn)
for params_name in params:
for fn in legal_params_fns:
if has_arg(fn, params_name):
if params_name != 'nb_epoch':
raise ValueError(
'{} is not a legal parameter'.format(params_name))
def fit(self, X, y, **kwargs):
# X is a dict
# separate arguments into fit_args and estimator args
# if self.build_fn is None:
# self.model = self.__call__(**self.filter_sk_params(self.__call__))
# elif (not isinstance(self.build_fn, types.FunctionType) and
# not isinstance(self.build_fn, types.MethodType)):
# self.model = self.build_fn(
# **self.filter_sk_params(self.build_fn.__call__))
# else:
# self.model = self.build_fn(**self.filter_sk_params(self.build_fn))
self.model = self.build_fn(**self.filter_sk_params(self.build_fn))
loss_name = self.model.loss
if hasattr(loss_name, '__name__'):
loss_name = loss_name.__name__
if loss_name == 'categorical_crossentropy' and len(y.shape) != 2:
y = to_categorical(y)
fit_args = copy.deepcopy(self.filter_sk_params(
# Since the input layers are named, I can pass the inputs as a dictionary
history =, y, **fit_args)
return history
def predict(self, X, **kwargs):
kwargs = self.filter_sk_params(Model.predict, kwargs)
return np.squeeze(self.model.predict(X, **kwargs), axis=-1)
def score(self, x, y, **kwargs):
kwargs = self.filter_sk_params(Model.evaluate, kwargs)
loss = self.model.evaluate(x, y, **kwargs)
if isinstance(loss, list):
return -loss[0]
return -loss
def graph_conv_model(hlayers_sizes='[10]', hidden_dropout=0, main_batchnorm=False,
hidden_activation='relu', learn_rate=0.01, initializer='glorot_uniform', optimizer='adam',
max_atoms=None, max_degree=None, num_atom_features=None, num_bond_features=None,
conv_layers_sizes = '[10]', fp_layer_size='[1024]', conv_activation = 'relu', fp_activation = 'softmax',
conv_bias=True, fp_bias=True, conv_l1 = 0, fp_l1 = 0, conv_l2 = 0, fp_l2 = 0, conv_dropout = 0,
fp_dropout = 0, conv_batchnorm = False, fp_batchnorm = False, conv_kwargs = {}, fp_kwargs = {},
atomwise_dropout = True, graphpool = False, prot_max_seq_len=0, prot_charseqset_size=0,
prot_num_filters=0, prot_filter_length=0):
"""Builds a Keras model with a drug encoding subnetwork based on molecular graph convolutions (Duvenaud et al.'s
neural fingerprint, a protein encoding subnetwork that uses Conv1D, and a fully connected prediction network that
predicts pKd values."""
hlayers_sizes = ast.literal_eval(hlayers_sizes) # hlayers_sizes was passed as a str because Pipeline throws RuntimeError when cloning the model if parameters are lists
conv_layers_sizes = ast.literal_eval(conv_layers_sizes) # same as above
fp_layer_size = ast.literal_eval(fp_layer_size) # same as above
# GraphConv subnetwork for drugs
atoms = Input(name='atoms', shape=(max_atoms, num_atom_features))
bonds = Input(name='bonds', shape=(max_atoms, max_degree, num_bond_features))
edges = Input(name='edges', shape=(max_atoms, max_degree), dtype='int32')
drug_fp = build_graph_conv_net(data_input=[atoms, bonds, edges],
conv_layer_sizes=conv_layers_sizes, fp_layer_size=fp_layer_size,
conv_activation=conv_activation, fp_activation=fp_activation,
conv_bias=conv_bias, fp_bias=fp_bias,
conv_l1=conv_l1, fp_l1=fp_l1,
conv_l2=conv_l2, fp_l2=fp_l2,
conv_dropout=conv_dropout, fp_dropout=fp_dropout,
conv_batchnorm=conv_batchnorm, fp_batchnorm=fp_batchnorm,
conv_kwargs=conv_kwargs, fp_kwargs=fp_kwargs,
# Conv1D subnetwork for proteins
prot_seqs = Input(name='prot_seqs', shape=(prot_max_seq_len,), dtype='int32')
prot_encoded = Embedding(input_dim=prot_charseqset_size+1, output_dim=128, input_length=prot_max_seq_len)(prot_seqs)
prot_encoded = Conv1D(filters=prot_num_filters, kernel_size=prot_filter_length, activation='relu', padding='valid',
prot_encoded = Conv1D(filters=prot_num_filters * 2, kernel_size=prot_filter_length, activation='relu', padding='valid',
prot_encoded = Conv1D(filters=prot_num_filters * 3, kernel_size=prot_filter_length, activation='relu', padding='valid',
prot_encoded = GlobalMaxPooling1D()(prot_encoded)
# Merge
x = concatenate([drug_fp, prot_encoded])
# Prediction network
for i in range(len(hlayers_sizes)):
x = Dense(units=hlayers_sizes[i], activation=hidden_activation, kernel_initializer=initializer)(x)
if main_batchnorm:
x = BatchNormalization()(x)
if hidden_dropout > 0:
x = Dropout(rate=hidden_dropout)(x)
output = Dense(1, kernel_initializer=initializer)(x)
# Create Keras Functional API Model (should this be done inside this function??)
model = Model(inputs=[atoms, bonds, edges, prot_seqs], outputs=[output])
# Define optimizer
if optimizer == 'adam':
opt = Adam(lr=learn_rate)
elif optimizer == 'sgd':
opt = SGD(lr=learn_rate, decay=1e-6, momentum=0.9,
nesterov=True) # 0.9 recommended value for momentum, and values used are the ones in the Keras example
elif optimizer == 'rmsprop':
opt = RMSprop(lr=learn_rate)
# Compile model
model.compile(loss='mean_squared_error', optimizer=opt)
return model
def build_graph_conv_net(data_input,
conv_layer_sizes=[], fp_layer_size=1024,
conv_activation='relu', fp_activation='softmax',
conv_bias=True, fp_bias=True,
conv_l1=0, fp_l1=0,
conv_l2=0, fp_l2=0,
conv_dropout=0, fp_dropout=0,
#conv_batchnorm=0, fp_batchnorm=0,
conv_batchnorm=False, fp_batchnorm=False,
conv_kwargs={}, fp_kwargs={},
atomwise_dropout=True, graphpool=False):
# Based on a modified version of this implementation:
# ======= Process network parameters =======
# Rename for consistency
fp_layer_sizes = fp_layer_size
# Ensure fp_layer_sizes is a list
if not is_iterable(fp_layer_sizes):
fp_layer_sizes = [fp_layer_sizes]
# Merge all parameter into tuples for each layer
# conv_layers = zip_mixed(conv_layer_sizes, conv_activation, conv_bias,
# conv_l1, conv_l2, conv_dropout, conv_batchnorm,
# conv_kwargs, graphpool, repeat_classes=[dict, str])
# fp_layers = zip_mixed(fp_layer_sizes, fp_activation, fp_bias,
# fp_l1, fp_l2, fp_dropout, fp_batchnorm,
# fp_kwargs, repeat_classes=[dict, str])
conv_layers = list(zip_mixed(conv_layer_sizes, conv_activation, conv_bias,
conv_l1, conv_l2, conv_dropout, conv_batchnorm,
conv_kwargs, graphpool, repeat_classes=[dict, str]))
fp_layers = list(zip_mixed(fp_layer_sizes, fp_activation, fp_bias,
fp_l1, fp_l2, fp_dropout, fp_batchnorm,
fp_kwargs, repeat_classes=[dict, str]))
# Ensure fp_layers is of length conv_layers+1
if len(fp_layer_sizes) != len(conv_layer_sizes)+1:
assert len(fp_layer_sizes) == 1, 'Incorrect amount of fingerprint layers specified. Either specify 1 or len(conv_layer_sizes)+1 ({}) fp_layer_sizes ({})'.format(len(conv_layer_sizes)+1, len(fp_layer_sizes))
# Add None for fp_layer_sizes and add None-tuples to fp_layers to align
# fp layers with conv_layers (the one layer provided will be the last layer)
fp_layer_sizes = [None]*len(conv_layer_sizes) + list(fp_layer_sizes)
fp_layers = [(None, )*len(fp_layers[0])] *len(conv_layer_sizes) + fp_layers
# Check zip result is the same length as specified sizes
assert len(conv_layers) == len(conv_layer_sizes), 'If `conv_`-layer-arguments are specified as a list, they should have the same length as `conv_layer_sizes` (length {0}), found an argument of lenght {1}'.format(len(conv_layer_sizes), len(conv_layers))
assert len(fp_layers) == len(fp_layer_sizes), 'If `fp`-layer-arguments are specified as a list, they should have the same length as `fp_layer_sizes` (len {0}), found an argument of lenght {1}'.format(len(fp_layer_sizes), len(fp_layers))
# ======= Build the network =======
# The inputs and outputs
atoms, bonds, edges = data_input
fingerprint_outputs = []
def ConvDropout(p_dropout):
""" Defines the standard Dropout layer for convnets"""
if atomwise_dropout:
return AtomwiseDropout(p_dropout)
return Dropout(p_dropout)
# Add first output layer directly to atom inputs
fp_size, fp_activation, fp_bias, fp_l1, fp_l2, fp_dropout, fp_batchnorm, fp_kwargs = fp_layers.pop(0)
if fp_size:
fp_atoms_in = atoms
if fp_batchnorm:
# fp_atoms_in = BatchNormalization(fp_batchnorm)(fp_atoms_in)
fp_atoms_in = BatchNormalization()(fp_atoms_in)
if fp_dropout:
fp_atoms_in = ConvDropout(fp_dropout)(fp_atoms_in)
fp_out = NeuralGraphOutput(fp_size, activation=fp_activation, use_bias=fp_bias,
kernel_regularizer=l1_l2(fp_l1, fp_l2),
bias_regularizer=l1_l2(fp_l1, fp_l2),
**fp_kwargs)([fp_atoms_in, bonds, edges])
# Add Graph convolutional layers
convolved_atoms = [atoms]
for conv_layer, fp_layer in zip(conv_layers, fp_layers):
# Import parameters
(conv_size, conv_activation, conv_bias, conv_l1, conv_l2, conv_dropout, conv_batchnorm,
conv_kwargs, graphpool) = conv_layer
fp_size, fp_activation, fp_bias, fp_l1, fp_l2, fp_dropout, fp_batchnorm, fp_kwargs = fp_layer
# Add hidden layer
atoms_in = convolved_atoms[-1]
# if conv_batchnorm:
# atoms_in = BatchNormalization(conv_batchnorm)(atoms_in)
if conv_batchnorm:
atoms_in = BatchNormalization()(atoms_in)
if conv_dropout:
atoms_in = ConvDropout(conv_dropout)(atoms_in)
# Use inner_layer_fn init method of `NeuralGraphHidden`, because it is
# the most powerful (e.g. allows custom activation functions)
def inner_layer_fn():
return Dense(conv_size, activation=conv_activation, use_bias=conv_bias,
kernel_regularizer=l1_l2(conv_l1, conv_l2),
bias_regularizer=l1_l2(conv_l1, conv_l2), **conv_kwargs)
atoms_out = NeuralGraphHidden(inner_layer_fn)([atoms_in, bonds, edges])
if graphpool:
atoms_out = NeuralGraphPool()([atoms_out, bonds, edges])
# Export
# Add output layer (if specified)
if fp_size:
fp_atoms_in = atoms_out
# if fp_batchnorm:
# fp_atoms_in = BatchNormalization(fp_batchnorm)(fp_atoms_in)
if fp_batchnorm:
fp_atoms_in = BatchNormalization()(fp_atoms_in)
if fp_dropout:
fp_atoms_in = ConvDropout(fp_dropout)(fp_atoms_in)
fp_out = NeuralGraphOutput(fp_size, activation=fp_activation, use_bias=fp_bias,
kernel_regularizer=l1_l2(fp_l1, fp_l2),
bias_regularizer=l1_l2(fp_l1, fp_l2),
**fp_kwargs)([fp_atoms_in, bonds, edges])
# Export
# Merge fingerprint
if len(fingerprint_outputs) > 1:
# final_fp = merge(fingerprint_outputs, mode=fp_merge_mode) # (Delora) merge is deprecated in Keras 2
final_fp = add(fingerprint_outputs)
final_fp = fingerprint_outputs[-1]
return final_fp
from src.data_preprocessing import *
import pandas as pd
import numpy as np
from NGF.preprocessing import tensorise_smiles
from src.data_preprocessing import add_chemical_identifier_to_df
import json
from itertools import chain
import requests
def create_adjacency_matrices(train_df, test_df):
"""Computes adjacency matrices for drugs based on SMILES"""
train_df.fillna(value="", axis=0, inplace=True)
test_df.fillna(value="", axis=0, inplace=True)
# some compounds might not have SMILES. doing this will allow tensorise_smiles to run, and it will just produce
# tensors filled with zeros for compounds without SMILES
train_atoms, train_bonds, train_edges = tensorise_smiles(train_df['canonical_smiles'].tolist())'generated/train_atoms.npy', train_atoms)'generated/train_bonds.npy', train_bonds)'generated/train_edges.npy', train_edges)
test_atoms, test_bonds, test_edges = tensorise_smiles(test_df['Compound_SMILES'].tolist(),
max_degree=train_bonds.shape[2])'generated/test_atoms.npy', test_atoms)'generated/test_bonds.npy', test_bonds)'generated/test_edges.npy', test_edges)
uniprot_accession_url = lambda \
accessions: '' + ','.join(accessions)
def get_json_from_uniprot(acc_list):
rq = requests.get(uniprot_accession_url(acc_list), headers={'Accept': 'application/json'})
dct = json.loads(rq.content)
return dct
def get_uniprot_data(protein_list, output_file):
full_entries = list(
chain(*[get_json_from_uniprot(protein_list[i:i + 100]) for i in range(0, len(protein_list), 100)]))
with open(output_file, 'w') as f:
json.dump(full_entries, f)
def get_dict_from_json(obj):
return {'sequence': obj['sequence']['sequence']}
def label_sequence(seq, max_seq_len):
"""Encodes protein sequences by mapping amino acids to integers"""
# adapted from DeepDTA code
charseqset = {"A": 1, "C": 2, "B": 3, "E": 4, "D": 5, "G": 6, "F": 7, "I": 8, "H": 9, "K": 10, "M": 11, "L": 12,
"O": 13, "N": 14, "Q": 15, "P": 16, "S": 17, "R": 18, "U": 19, "T": 20, "W": 21, "V": 22, "Y": 23,
"X": 24, "Z": 25}
X = np.zeros(max_seq_len)
for i, ch in enumerate(seq[:max_seq_len]):
X[i] = charseqset[ch]
return X
def create_prot_input_data(train_df, test_df, uniprot_json_file):
with open(uniprot_json_file, 'r') as f:
uniprot_json = json.load(f)
uniprot_seqs = {item['accession']: get_dict_from_json(item) for item in uniprot_json}
uniprot_df_seqs = pd.DataFrame.from_dict(uniprot_seqs).T # whole sequence for each protein
train_df = train_df.merge(uniprot_df_seqs, how='left', left_on='target_id', right_index=True)
test_df = test_df.merge(uniprot_df_seqs, how='left', left_on='UniProt_Id', right_index=True)
max_seq_len = int(train_df['sequence'].map(len).max())# return this value for later use
train_df['label_encoded'] = train_df['sequence'].apply(label_sequence, max_seq_len=max_seq_len)
test_df['label_encoded'] = test_df['sequence'].apply(label_sequence, max_seq_len=max_seq_len)
# should we save df, or only the label encoded protein data??'generated/train_prot.npy', np.asarray(train_df['label_encoded'].tolist()))'generated/test_prot.npy', np.asarray(test_df['label_encoded'].tolist()))
def create_y_train(train_df, output_file):, -np.log10(train_df['standard_value'] / (10 ** 9)).values.ravel())
if __name__ == '__main__':
training_data = add_chemical_identifier_to_df(preprocess_interaction_data('data/KD_data.csv'))
# TODO: use Kd samples in DtcDrugTargetInteractionsJan2019.csv file instead of KD_data.csv
test_data = pd.read_csv('data/round_2_template.csv')
# Limiting training data to proteins that are in the round 2 template only:
training_data = training_data.loc[training_data['target_id'].isin(test_data['UniProt_Id']), :]
training_data.to_csv('generated/KD_data_subset_round2.csv', index=False)
create_y_train(training_data, 'generated/y_train_conv.npy')
# Compounds
create_adjacency_matrices(training_data, test_data)
# Proteins
proteins = list(set([s.strip() for s in ','.join(training_data.target_id.dropna().unique()).split(',')]) | \
set(test_data.UniProt_Id.unique())) # only considering proteins in test set
get_uniprot_data(proteins, 'generated/uniprot_round2.json')
create_prot_input_data(training_data, test_data, 'generated/uniprot_round2.json')
# df2 = training_data.sample(frac=0.05, axis=0)
# df2.to_csv('test/conv_small_df.csv', index=False)
# create_y_train(df2, 'test/conv_small_df_y_train.npy')
# create_adjacency_matrices(df2, test_data)
# create_prot_input_data(df2, test_data, 'generated/uniprot_round2.json')
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import make_scorer
from inspect import getmembers, isfunction
from src import evaluation_metrics
from src.drug_prot_conv import DrugProtConvModel, graph_conv_model
from sklearn.base import clone
from collections import defaultdict
from keras.models import load_model
from evaluation_metrics import average_AUC, rmse, spearman
def make_scorers():
functions_list = getmembers(evaluation_metrics, isfunction)
scorer_list = []
for metric_name, func in functions_list:
if metric_name == 'rmse':
scorer_list.append((metric_name, make_scorer(func, greater_is_better=False)))
elif (metric_name=='spearman') or (metric_name=='average_AUC'):
scorer_list.append((metric_name, make_scorer(func, greater_is_better=True)))
return dict(scorer_list)
# TODO: use DtcDrugTargetInteractionsJan2019.csv file instead of KD_data.csv?
# Load preprocessed data
train_dict = {'atoms':np.load('generated/train_atoms.npy'),
y_train = np.load('generated/y_train_conv.npy')
test_dict = {'atoms':np.load('generated/test_atoms.npy'),
# # Smaller files to test pipeline
# train_dict = {'atoms':np.load('test/small_df_train_atoms.npy'),
# 'bonds':np.load('test/small_df_train_bonds.npy'),
# 'edges':np.load('test/small_df_train_edges.npy'),
# 'prot_seqs':np.load('test/small_df_train_prot.npy')}
# y_train = np.load('test/conv_small_df_y_train.npy')
# test_dict = {'atoms':np.load('test/small_df_test_atoms.npy'),
# 'bonds':np.load('test/small_df_test_bonds.npy'),
# 'edges':np.load('test/small_df_test_edges.npy'),
# 'prot_seqs':np.load('test/small_df_test_prot.npy')}
# Model evaluation
hyperparam_vals = {'batch_size':32,
'epochs':200, # should we use EarlyStopping?
'hlayers_sizes':'[1000. 500]', # pick better layer sizes
'conv_layers_sizes':'[10, 10]',
'fp_layer_size':'[50, 50, 50]',
'prot_num_filters':32, # prot_num_filters=32 as in DeepDTA paper
'prot_filter_length':8} # filter length = [4,8,12] (range of values tested for this hyperparameter in the DeepDTA paper)
# TODO: pick hyperparam values
# TODO: create a ParamGrid to perform grid search for certain hyperparams??
# ValueError: "input_length" is 2549, but received input has shape (None, 2549, 25)
estimator = DrugProtConvModel(build_fn=graph_conv_model, verbose=2, **hyperparam_vals)
# Since we have multiple inputs that are stored in a dict, we can't use sklearn's cross_validate directly
# TODO: since we have so many samples, should we just split into train/test sets instead of using cross-validation??
kf = KFold(n_splits=5)
# split original dataset to get split indices
X = pd.read_csv('generated/KD_data_subset_round2.csv')
# X = pd.read_csv('test/conv_small_df.csv')
splits = kf.split(X)
cv_scores = defaultdict(list)
# scorers = make_scorers()
for train_index, test_index in splits:
# Split all X datasets in dict
X_train_dict = {}
X_test_dict = {}
for key in train_dict.keys():
X_train_dict[key] = train_dict[key][train_index]
X_test_dict[key] = train_dict[key][test_index]
# split y
y_fold_train = y_train[train_index]
y_fold_test = y_train[test_index]
# clone the model
model = clone(estimator) # using sklearn clone function, which is what is used in the sklearn cross validation functions
# TODO: should try to parallelize this like the sklearn cross validation functions do
# fit self.model, y_fold_train)
# predict test fold and calculate scores for fold
y_pred = model.predict(X_test_dict)
# calculate metrics after each fold, save to list and then use the average value as the final cv value for each metric??
# for scorer, score_func in scorers.items():
# cv_scores[scorer].append(score_func(y_fold_test, y_pred))
cv_scores['spearman'].append(spearman(y_fold_test, y_pred))
cv_scores['RMSE'].append(rmse(y_fold_test, y_pred))
cv_scores['average_AUC'].append(average_AUC(y_fold_test, y_pred))
for metric, scores in cv_scores.items():
avg = sum(scores)/len(scores)
avg_scores[metric] = avg
# TODO: instead of printing, save results and other model details to file
# Since the model uses custom layers that are not part of Keras, saving and then reloading the model doesn't work (ValueError: Unknown layer: NeuralGraphHidden)
# Save model
#, y_train)
# Load and re-wrap model
# model_filepath = 'dream/models/drug_prot_conv.h5'
# build_by_loading = lambda: load_model(model_filepath)
# model = DrugProtConvModel(build_fn=build_by_loading)
# model.model = build_by_loading # have to set the model because build_fn is only called when the fit method of the wrapper is called
# Predict test set
model = clone(estimator), y_train)
pred = model.predict(test_dict)
prediction_to_submit = pd.read_csv('data/round_2_template.csv')
prediction_to_submit['pKd_[M]_pred'] = pred
prediction_to_submit.to_csv('dream/round2/test_predictions.csv', index=None)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment