
Commit c66ab7b8 authored by Vítor Vieira

[ADD] Pre-processing and model training

parent 141a3556
from src.get_molecules import *
import pandas as pd  # pd is used throughout; made explicit rather than relying on the star import
import re
def get_chemical_mapping_df(chembl_map_path='data/chembl_24_1_chemreps.txt.gz', index_by='standard_inchi_key'):
return pd.read_csv(chembl_map_path, sep='\t').set_index(index_by, drop=True)
def preprocess_interaction_data(data_path='data/KD_data.csv',
cols_to_keep=('standard_inchi_key', 'target_id', 'standard_value')):
df = pd.read_csv(data_path, index_col=0)
    # Keep only rows with all key columns present, then restrict to those columns.
    df = df[~df[list(cols_to_keep)].T.isna().any()].dropna(axis=1)[list(cols_to_keep)]
    # Discard measurements annotated with multiple targets (comma-separated target ids).
    df = df[~df.target_id.apply(lambda x: ',' in x)]
return df
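# Hedged usage sketch (default paths above; not part of the original module):
#   interactions = preprocess_interaction_data()
#   list(interactions.columns)  # -> ['standard_inchi_key', 'target_id', 'standard_value']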
def add_chemical_identifier_to_df(df, chemical_mapping_df=None, index_identifier='standard_inchi_key', new_identifier='canonical_smiles'):
    # 'if not chemical_mapping_df' raises ValueError on a DataFrame (ambiguous truth value), so test for None.
    if chemical_mapping_df is None:
        chemical_mapping_df = get_chemical_mapping_df(chembl_map_path='data/chembl_24_1_chemreps.txt.gz', index_by=index_identifier)
    return df.set_index(index_identifier).join(chemical_mapping_df[new_identifier], how='left').reset_index()
def get_compound_feature_dataframe(df, descriptors, smiles_column='canonical_smiles'):
compound_df = df[[smiles_column]].rename(columns={smiles_column: 'smiles'}).drop_duplicates()
descriptor_dfs = list(map(lambda x: df_for_molecular_descriptors(compound_df, x), descriptors))
descriptor_dfs_no_na = [ind_df.dropna(axis=0) for ind_df in descriptor_dfs]
del descriptor_dfs
tdfs = []
integer_descriptors = ['Morgan', 'MACCS']
    for descriptor, dcdf in zip(descriptors, descriptor_dfs_no_na):
        # Object-dtype columns hold per-molecule vectors (e.g. fingerprints); expand into one column per element.
        if dcdf[descriptor].dtype == 'O':
            feature_df = pd.DataFrame.from_dict(dict(zip(dcdf.index, dcdf[descriptor].values))).T
if descriptor in integer_descriptors:
feature_df = feature_df.astype(int)
feature_df.columns = [descriptor + "_" + str(i) for i in range(feature_df.shape[1])]
feature_df['smiles'] = dcdf.smiles
else:
feature_df = dcdf
tdfs.append(feature_df)
    final_descriptor_df = pd.concat([tdf.set_index('smiles') for tdf in tdfs], axis=1, join='inner')
return final_descriptor_df
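# Illustrative call (mirrors the descriptor list used in the training script;
# df_for_molecular_descriptors is assumed, per src.get_molecules, to return the
# compound dataframe with one extra column per descriptor):
#   features = get_compound_feature_dataframe(interaction_data, ['Morgan', 'MACCS', 'MolLogP'])
#   # -> indexed by SMILES, with columns Morgan_0..Morgan_N, MACCS_0..MACCS_M, MolLogP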
def seq_to_uniprot_id_map(domseq_to_uniprot_path):
return pd.read_csv(domseq_to_uniprot_path, index_col=0)
def get_domain_dataframe(dom_dataframe_path):
df = pd.read_csv(dom_dataframe_path)
return df
def get_accession_to_identifiers_df(acc_to_identifiers_path, domseq_to_uniprot_path):
acc_to_identifiers_df = pd.read_csv(acc_to_identifiers_path, index_col=0)
domseq_map = seq_to_uniprot_id_map(domseq_to_uniprot_path)
return acc_to_identifiers_df.loc[domseq_map.uniprot_acc.unique()]
def get_multiple_alignment_features_df(multiple_alignf_path):
df = pd.read_csv(multiple_alignf_path, index_col=0)
df.index = [x.split('|')[1] for x in df.index]
return df
def get_protein_features_df(protein_features_path, domseq_to_uniprot_path, drop_columns):
    def read_list_from_str(list_str):
        # Extract floats from a stringified list such as '[0.12, -3.0, 4.5]'.
        pattern = r'[-0-9.]+'
        matcher = re.compile(pattern)
        float_list = [float(x) for x in matcher.findall(list_str)]
        return float_list
dseq_map = seq_to_uniprot_id_map(domseq_to_uniprot_path)
df = pd.read_csv(protein_features_path, index_col=0).drop_duplicates().drop(columns=drop_columns)
subset_df = df.loc[dseq_map.index, :]
merged_df = subset_df.join(dseq_map, how='inner').set_index('uniprot_acc').drop(columns='pfamA_acc')
    # Drop rows that are entirely NaN, then drop columns that are entirely NaN.
    merged_df = (merged_df.loc[~merged_df.isna().T.apply(lambda x: x.all())]).dropna(axis=1, how='all')
list_df = merged_df.applymap(read_list_from_str)
final_df = None
for col in list_df.columns:
try:
df_for_col = pd.DataFrame.from_dict(dict(zip(list_df.index, list_df[col].values))).T
print(df_for_col.shape)
df_for_col.columns = [col + str(i) for i in range(df_for_col.shape[1])]
if final_df is None:
final_df = df_for_col
else:
final_df = final_df.join(df_for_col, how='left')
        except Exception:
            print(col, 'failed')
return final_df
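# Worked example of read_list_from_str (protein_features.csv stores each feature vector
# as a stringified list): read_list_from_str('[0.12, -3.0, 4.5]') -> [0.12, -3.0, 4.5].
# Each vector is then widened into columns '<feature>0', '<feature>1', ... per uniprot_acc.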
def load_interaction_data(interactions_path):
interaction_data = pd.read_csv(interactions_path, index_col=0)
def interaction_to_index_map(df):
d = {}
for index, row in zip(df.index, df[['compound_id', 'target_id']].values):
if tuple(row) not in d:
d[tuple(row)] = [index]
else:
d[tuple(row)].append(index)
return d
int_to_ind_map = interaction_to_index_map(interaction_data)
return interaction_data, int_to_ind_map
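# The second return value maps each (compound_id, target_id) pair to the row indices of
# its (possibly replicated) measurements, e.g. {('CHEMBL25', 'P00533'): [0, 17]}
# (identifiers here are hypothetical).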
def generate_data(acc_to_identifiers_path, multiple_alignf_path, domseq_to_uniprot_path,
interactions_path, protein_features_to_drop, protein_features_path, mol_descriptors, write=False):
# Load dataset (protein and compound features already calculated)
# output needs to be transformed to pKd (-log Kd)
# return X and y (if it exists)
#test_template = pd.read_csv(template_path)
acc_to_identifiers_df = get_accession_to_identifiers_df(acc_to_identifiers_path, domseq_to_uniprot_path)
multiple_alignment_features_df = get_multiple_alignment_features_df(multiple_alignf_path)
protein_features_df = get_protein_features_df(protein_features_path, domseq_to_uniprot_path, protein_features_to_drop)
interaction_data = add_chemical_identifier_to_df(preprocess_interaction_data(interactions_path))
interaction_data = interaction_data[interaction_data.target_id.isin(protein_features_df.index)]
compound_df = get_compound_feature_dataframe(interaction_data, mol_descriptors)
final_protein_df = acc_to_identifiers_df.join(multiple_alignment_features_df).join(protein_features_df)
if write:
final_protein_df.to_csv('generated/TRAIN_protein_data.csv')
interaction_data.to_csv('generated/TRAIN_intrx_data.csv')
compound_df.to_csv('generated/TRAIN_compound_df.csv')
return interaction_data, compound_df, final_protein_df
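# End-to-end sketch of this module's entry point (mirrors the load_features() call in
# the training script; paths are the 'generated/' artifacts referenced in this commit):
#   interaction_data, compound_df, final_protein_df = generate_data(
#       'generated/accession_to_identifiers_df.csv',
#       'generated/multiple_align_features.csv',
#       'generated/domain_sequence_to_uniprot_mapping.csv',
#       'generated/training_df.csv',
#       ['APAAC', 'Moran', 'Geary', 'Tripeptide'],
#       'generated/protein_features.csv',
#       mol_descriptors=['Morgan', 'MACCS', 'MolLogP'],
#       write=True)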
\ No newline at end of file
from src import evaluation_metrics
from src import evaluation_metrics, data_preprocessing
from src.data_preprocessing import *
import pandas as pd
import numpy as np
import re
@@ -7,13 +8,13 @@ from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import SelectKBest, VarianceThreshold
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer
from sklearn.compose import ColumnTransformer
from inspect import getmembers, isfunction
test_template_path = 'dream/round_1_template.csv'
template_path = 'dream/round_1_template.csv' # done
dom_dataframe_path = 'generated/domain_sequences.csv' # done
interactions_path = 'generated/training_df.csv' # done
@@ -23,103 +24,36 @@ domseq_to_uniprot_path = 'generated/domain_sequence_to_uniprot_mapping.csv' # nv
multiple_alignf_path = 'generated/multiple_align_features.csv'
protein_features_path = 'generated/protein_features.csv'
test_df = pd.DataFrame([[1]*5,[2]*5,[3]*5]).T
test_df.index = [1,1,2,2,3]
test_df.loc[2,:]
def seq_to_uniprot_id_map(domseq_to_uniprot_path):
return pd.read_csv(domseq_to_uniprot_path, index_col=0)
def get_domain_dataframe(dom_dataframe_path):
df = pd.read_csv(dom_dataframe_path)
return df
def get_accession_to_identifiers_df(acc_to_identifiers_path, domseq_to_uniprot_path):
acc_to_identifiers_df = pd.read_csv(acc_to_identifiers_path, index_col=0)
domseq_map = seq_to_uniprot_id_map(domseq_to_uniprot_path)
return acc_to_identifiers_df.loc[domseq_map.uniprot_acc.unique()]
def get_multiple_alignment_features_df(multiple_alignf_path):
df = pd.read_csv(multiple_alignf_path, index_col=0)
df.index = [x.split('|')[1] for x in df.index]
return df
def get_protein_features_df(protein_features_path, domseq_to_uniprot_path):
def read_list_from_str(list_str):
pattern = '[\-0-9\.]+'
matcher = re.compile(pattern)
float_list = [float(x) for x in matcher.findall(list_str)]
return float_list
dseq_map = seq_to_uniprot_id_map(domseq_to_uniprot_path)
df = pd.read_csv(protein_features_path, index_col=0).drop_duplicates()
subset_df = df.loc[dseq_map.index,:]
merged_df = subset_df.join(dseq_map, how='inner').set_index('uniprot_acc').drop(columns='pfamA_acc')
merged_df = (merged_df.loc[~merged_df.isna().T.apply(lambda x: x.all())]).dropna(axis=1, how='all')
list_df = merged_df.applymap(read_list_from_str)
final_df = None
for col in list_df.columns:
try:
df_for_col = pd.DataFrame.from_dict(dict(zip(list_df.index, list_df[col].values))).T
print(df_for_col.shape)
df_for_col.columns = [col+str(i) for i in range(df_for_col.shape[1])]
if final_df is None:
final_df = df_for_col
else:
final_df = final_df.join(df_for_col, how='left')
except:
print(col,'failed')
return final_df
def load_interaction_data(interactions_path):
interaction_data = pd.read_csv(interactions_path, index_col=0)
def interaction_to_index_map(df):
d = {}
for index, row in zip(df.index, df[['compound_id', 'target_id']].values):
if tuple(row) not in d:
d[tuple(row)] = [index]
else:
d[tuple(row)].append(index)
return d
int_to_ind_map = interaction_to_index_map(interaction_data)
return interaction_data, int_to_ind_map
def generate_data(template_path, acc_to_identifiers_path, multiple_alignf_path, domseq_to_uniprot_path, interactions_path):
# Load dataset (protein and compound features already calculated)
# output needs to be transformed to pKd (-log Kd)
# return X and y (if it exists)
test_template = pd.read_csv(template_path)
acc_to_identifiers_df = get_accession_to_identifiers_df(acc_to_identifiers_path, domseq_to_uniprot_path)
multiple_alignment_features_df = get_multiple_alignment_features_df(multiple_alignf_path)
protein_features_df = get_protein_features_df(protein_features_path, domseq_to_uniprot_path)
interaction_data, int_to_ind_map = load_interaction_data(interactions_path)
interaction_data = interaction_data[interaction_data.target_id.isin(protein_features_df.index)]
final_protein_df = acc_to_identifiers_df.join(multiple_alignment_features_df).join(protein_features_df)
final_protein_df.to_csv('generated/TRAIN_protein_data.csv')
interaction_data.to_csv('generated/TRAIN_intrx_compound_data.csv')
def load_training_data(protein_data_path, intrx_compound_path):
## TODO: Deal with MemoryError
final_protein_df = pd.read_csv(protein_data_path, index_col = 0)
interaction_data = pd.read_csv(intrx_compound_path, index_col = 0)
X = interaction_data.set_index('target_id', drop=True).join(final_protein_df)
return X.drop(columns=['standard_value']), X['standard_value']
protein_data_path = 'generated/TRAIN_protein_data.csv'
intrx_compound_path = 'generated/TRAIN_intrx_compound_data.csv'
def load_features():
## TODO: Deal with MemoryError
#final_protein_df = pd.read_csv(protein_data_path, index_col = 0)
protein_features_to_drop = [
'APAAC',
'Moran',
'Geary',
'Tripeptide'
]
interaction_data, compound_df, final_protein_df = generate_data(
acc_to_identifiers_path,
multiple_alignf_path,
domseq_to_uniprot_path,
interactions_path,
protein_features_to_drop,
protein_features_path,
mol_descriptors=['Morgan','MACCS','MolLogP'])
return interaction_data, compound_df, final_protein_df
def load_test_data(test_template_path):
    ## TODO: Needs to be implemented
    template = pd.read_csv(test_template_path)
    test_compound_df = get_compound_feature_dataframe(template, ['Morgan', 'MACCS', 'MolLogP'], 'Compound_SMILES')
    # final_protein_df should come from load_features(); not wired up yet (see TODO above).
    test_protein_df = final_protein_df.loc[template.UniProt_Id]
def make_scorers():
functions_list = getmembers(evaluation_metrics, isfunction)
@@ -132,12 +66,44 @@ def make_scorers():
return scorer_list
protein_data_path = 'generated/TRAIN_protein_data.csv'
intrx_compound_path = 'generated/TRAIN_intrx_compound_data.csv'
def load_training_data():
interaction_data, compound_df, final_protein_df = load_features()
different_features = final_protein_df.apply(lambda x: len(x.value_counts()) > 1)
final_protein_df_1 = final_protein_df.loc[:,different_features]
    p = 0.01
    vt = VarianceThreshold(threshold=p)
ppl = Pipeline(steps=[('impute',SimpleImputer(strategy='most_frequent')), ('vt',vt)])
ppl.fit(final_protein_df_1)
final_protein_df_2 = final_protein_df_1.loc[:,ppl.steps[-1][1].get_support()]
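    # NB: SimpleImputer drops all-NaN columns, so get_support() aligns one-to-one with
    # final_protein_df_1's columns only because the earlier dropna(axis=1, how='all')
    # calls ensure no column is entirely NaN.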
df_w_compounds = interaction_data.set_index('canonical_smiles').join(compound_df).reset_index()
df_w_all = df_w_compounds.set_index('target_id').join(final_protein_df_2).reset_index()
## TODO: Remove compound and protein indexes (smiles/uniprot)
return df_w_all
def get_training_df():
df_w_all = load_training_data().drop(columns=['level_0', 'index', 'standard_inchi_key'])
X_train, y_train = df_w_all.drop(columns=['standard_value']), df_w_all['standard_value']
    numeric_features, binary_features = [], []
    for i, col in enumerate(X_train.columns):
        if df_w_all[col].dtype == np.float64:
            numeric_features.append(i)
        elif df_w_all[col].dtype == np.int64 and len(df_w_all[col].value_counts().index) == 2:
            binary_features.append(i)
return X_train, y_train, numeric_features, binary_features
X_train, y_train, numeric_features, binary_features = get_training_df()
#X_test = load_data()
# change when we have the real dataset...
@@ -145,14 +111,14 @@ X_train, y_train = load_training_data(protein_data_path, intrx_compound_path)
# numeric_features =
numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler())])
# categorical_features =
categorical_transformer = Pipeline(steps=['imputer', SimpleImputer(strategy='most_frequent'), ('onehot', OneHotEncoder())])
#categorical_transformer = Pipeline(steps=['imputer', SimpleImputer(strategy='most_frequent'), ('onehot', OneHotEncoder())])
#binary_features =
binary_transformer = SimpleImputer(strategy='most_frequent')
preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, numeric_features),
('cat', categorical_transformer, categorical_features),
# ('cat', categorical_transformer, categorical_features),
('bin', binary_transformer, binary_features)])
selector = SelectKBest(k=100) # change k; use another method?
selector = SelectKBest(k=1000) # change k; use another method?
estimator = SVR() # need to change this easily when testing though
pipe = Pipeline(steps=[('preprocessor', preprocessor),
('feature_selector', selector),
......
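# Hedged evaluation sketch (not part of the commit): the pipeline above would typically
# be scored with the custom metrics, assuming make_scorers() is adapted to return a
# dict of name -> make_scorer(...) entries (cross_validate accepts a dict for scoring,
# but not a plain list of callables):
#   scores = cross_validate(pipe, X_train, y_train, cv=5, scoring=make_scorers())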
import pandas as pd
import json
import numpy
import requests
@@ -109,7 +109,7 @@ domain_df.to_csv('generated/domain_sequences.csv')
# TODO: THIS SHOULD BE A SEPARATE SCRIPT
training_df = pd.read_csv('generated/training_df.csv')
domain_df = pd.read_csv('generated/domain_sequences.csv')
domain_df = pd.read_csv('generated/domain_sequences.csv', index_col = 0)
acc_to_identifiers_df = pd.read_csv('generated/accession_to_identifiers_df.csv')
with open('generated/uniprot_df.json', 'r') as f:
@@ -182,6 +182,9 @@ sub_domain_df = domain_df[domain_df['clan'] == 'CL0016'].reset_index()
domains_to_add = sub_domain_df.pfamA_acc.value_counts()[sub_domain_df.pfamA_acc.value_counts() > 4].index.values
sub_domain_df = sub_domain_df[sub_domain_df.pfamA_acc.isin(domains_to_add)]
sub_domain_df_to_write = sub_domain_df.copy().set_index('domain_sequence').drop(columns=['clan','index'])
sub_domain_df_to_write.to_csv('generated/domain_sequence_to_uniprot_mapping.csv')
# sub_domain_df = sub_domain_df[~sub_domain_df.uniprot_acc.isin(to_remove_list)]
seqs_to_align = sub_domain_df.values
......
@@ -7,6 +7,16 @@ df = df[~df['compound_id'].isnull()]
df.index = range(df.shape[0])
def get_chemical_mapping_df(chembl_map_path='data/chembl_24_1_chemreps.txt.gz', index_by='standard_inchi_key'):
return pd.read_csv(chembl_map_path, sep='\t').set_index(index_by, drop=True)
def preprocess_interaction_data(data_path='data/KD_data.csv', cols_to_keep=('compound_id', 'target_id', 'standard_value')):
df = pd.read_csv(data_path, index_col=0)
    df = df[~df[list(cols_to_keep)].T.isna().any()].dropna(axis=1)[list(cols_to_keep)]
return df
chembl_reps = pd.read_csv('data/chembl_24_1_chemreps.txt.gz', sep='\t').set_index('chembl_id')
compound_reps = chembl_reps.loc[df['compound_id'],:]
chembl_reps['chembl_id'] = chembl_reps.index
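# Copying the index back into a 'chembl_id' column keeps the identifier available after
# joins or reset_index calls that would otherwise discard the index.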
......