Git Lab CI for docker build enabled! You can enable it using .gitlab-ci.yml in your project. Check file template at https://gitlab.bio.di.uminho.pt/snippets/5

Commit 141a3556 authored by Vítor Vieira's avatar Vítor Vieira
Browse files

[ADD] Training dataset generation

parent eff6afef
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
from src import evaluation_metrics
import pandas as pd
import numpy as np
import re
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
......@@ -10,11 +14,112 @@ from sklearn.compose import ColumnTransformer
from inspect import getmembers, isfunction
def load_data():
    """Return the path of every input/generated dataset used by this script.

    The original body only bound the paths to local variables (and contained
    leftover scratch code building a throwaway DataFrame), so calling it had
    no usable effect.  The paths are now returned as a dict keyed by role so
    downstream code can look them up.  The trailing '# done' / '# nvm' notes
    from the original are preserved as status markers.
    """
    return {
        'template_path': 'dream/round_1_template.csv',  # done
        'dom_dataframe_path': 'generated/domain_sequences.csv',  # done
        'interactions_path': 'generated/training_df.csv',  # done
        'acc_to_identifiers_path': 'generated/accession_to_identifiers_df.csv',  # done
        'uniprot_json_path': 'generated/uniprot_df.json',  # nvm
        'domseq_to_uniprot_path': 'generated/domain_sequence_to_uniprot_mapping.csv',  # nvm
        'multiple_alignf_path': 'generated/multiple_align_features.csv',
        'protein_features_path': 'generated/protein_features.csv',
    }
def seq_to_uniprot_id_map(domseq_to_uniprot_path):
    """Load the domain-sequence -> UniProt accession mapping table.

    The first CSV column becomes the index (the domain-sequence key);
    remaining columns (e.g. 'uniprot_acc') are kept as data.
    """
    mapping_df = pd.read_csv(domseq_to_uniprot_path, index_col=0)
    return mapping_df
def get_domain_dataframe(dom_dataframe_path):
    """Read the per-domain sequence table as-is (default integer index)."""
    return pd.read_csv(dom_dataframe_path)
def get_accession_to_identifiers_df(acc_to_identifiers_path, domseq_to_uniprot_path):
    """Return identifier rows restricted to accessions present in the domain map.

    Loads the accession->identifiers table, then keeps only the rows whose
    index appears among the unique 'uniprot_acc' values of the
    domain-sequence -> UniProt mapping.
    """
    identifiers_df = pd.read_csv(acc_to_identifiers_path, index_col=0)
    wanted_accessions = seq_to_uniprot_id_map(domseq_to_uniprot_path).uniprot_acc.unique()
    return identifiers_df.loc[wanted_accessions]
def get_multiple_alignment_features_df(multiple_alignf_path):
    """Load multiple-alignment features, re-keying rows by UniProt accession.

    Row labels in the CSV look like 'db|ACCESSION|entry'; only the middle
    field (the accession) is kept as the index.
    """
    features = pd.read_csv(multiple_alignf_path, index_col=0)
    features.index = features.index.map(lambda label: label.split('|')[1])
    return features
def get_protein_features_df(protein_features_path, domseq_to_uniprot_path):
    """Build a wide numeric feature matrix keyed by UniProt accession.

    Each cell of the raw protein-features CSV holds a stringified list of
    floats (e.g. '[1.0, -2.5]').  Rows are restricted to domain sequences
    present in the mapping, re-indexed by 'uniprot_acc', and every list
    column is expanded into one numeric column per position, named
    '<col><i>'.  Columns whose expansion fails are reported and skipped.

    Fixes vs the original: the regex is a raw string (the old '[\\-0-9\\.]+'
    literal triggered an invalid-escape warning) and the bare `except:` is
    narrowed to `except Exception` so SystemExit/KeyboardInterrupt are not
    swallowed.
    """
    def read_list_from_str(list_str):
        # Pull every float-looking token out of the stringified list.
        pattern = r'[\-0-9\.]+'
        matcher = re.compile(pattern)
        return [float(tok) for tok in matcher.findall(list_str)]

    dseq_map = seq_to_uniprot_id_map(domseq_to_uniprot_path)
    df = pd.read_csv(protein_features_path, index_col=0).drop_duplicates()
    subset_df = df.loc[dseq_map.index, :]
    merged_df = (subset_df.join(dseq_map, how='inner')
                 .set_index('uniprot_acc')
                 .drop(columns='pfamA_acc'))
    # Drop rows that are entirely NaN, then columns that are entirely NaN.
    merged_df = (merged_df.loc[~merged_df.isna().T.apply(lambda x: x.all())]).dropna(axis=1, how='all')
    list_df = merged_df.applymap(read_list_from_str)
    final_df = None
    for col in list_df.columns:
        try:
            # One row per accession, one column per list position.
            df_for_col = pd.DataFrame.from_dict(dict(zip(list_df.index, list_df[col].values))).T
            print(df_for_col.shape)
            df_for_col.columns = [col + str(i) for i in range(df_for_col.shape[1])]
            if final_df is None:
                final_df = df_for_col
            else:
                final_df = final_df.join(df_for_col, how='left')
        except Exception:  # ragged/unparseable column: report and keep going
            print(col, 'failed')
    return final_df
def load_interaction_data(interactions_path):
    """Load the interactions table plus a (compound_id, target_id) -> rows map.

    Returns a tuple (interaction_data, int_to_ind_map) where the map lists,
    for every compound/target pair, all dataframe index labels holding that
    pair (pairs may repeat across rows).
    """
    interaction_data = pd.read_csv(interactions_path, index_col=0)

    def interaction_to_index_map(df):
        # Group the dataframe's index labels by their (compound, target) pair.
        pair_to_rows = {}
        pairs = df[['compound_id', 'target_id']].values
        for idx, pair in zip(df.index, pairs):
            pair_to_rows.setdefault(tuple(pair), []).append(idx)
        return pair_to_rows

    return interaction_data, interaction_to_index_map(interaction_data)
def generate_data(template_path, acc_to_identifiers_path, multiple_alignf_path,
                  domseq_to_uniprot_path, interactions_path,
                  protein_features_path='generated/protein_features.csv'):
    """Assemble the training tables and persist them under generated/.

    Joins accession identifiers, multiple-alignment features and protein
    features into one per-protein table, filters interactions down to
    targets with features, and writes:
      - 'generated/TRAIN_protein_data.csv'
      - 'generated/TRAIN_intrx_compound_data.csv'
    Returns None (side-effect only).

    # output needs to be transformed to pKd (-log Kd)

    Fixes vs the original: a leftover `pass` was removed, and
    `protein_features_path` is now a defaulted parameter — the original body
    referenced that name without it being in scope, raising NameError.
    """
    test_template = pd.read_csv(template_path)  # read for validation; currently unused
    acc_to_identifiers_df = get_accession_to_identifiers_df(acc_to_identifiers_path, domseq_to_uniprot_path)
    multiple_alignment_features_df = get_multiple_alignment_features_df(multiple_alignf_path)
    protein_features_df = get_protein_features_df(protein_features_path, domseq_to_uniprot_path)
    interaction_data, int_to_ind_map = load_interaction_data(interactions_path)
    # Keep only interactions whose target actually has protein features.
    interaction_data = interaction_data[interaction_data.target_id.isin(protein_features_df.index)]
    final_protein_df = acc_to_identifiers_df.join(multiple_alignment_features_df).join(protein_features_df)
    final_protein_df.to_csv('generated/TRAIN_protein_data.csv')
    interaction_data.to_csv('generated/TRAIN_intrx_compound_data.csv')
def load_training_data(protein_data_path, intrx_compound_path):
    """Join interaction rows with per-target protein features.

    ## TODO: Deal with MemoryError

    Returns a tuple (X, y): the joined feature matrix without the
    'standard_value' column, and 'standard_value' itself as the target.
    """
    proteins = pd.read_csv(protein_data_path, index_col=0)
    interactions = pd.read_csv(intrx_compound_path, index_col=0)
    joined = interactions.set_index('target_id', drop=True).join(proteins)
    y = joined['standard_value']
    features = joined.drop(columns=['standard_value'])
    return features, y
def load_test_data(template_path):
    """Load the round-submission template for the test set.

    ## TODO: Needs to be implemented — feature construction for the test set
    is still missing.  For now this returns the raw template DataFrame; the
    original read it and then discarded it (a bare `template` expression),
    returning None.
    """
    template = pd.read_csv(template_path)
    return template
def make_scorers():
    # Build one sklearn scorer per function exported by the project's
    # evaluation_metrics module.
    # NOTE(review): part of this body is hidden by a collapsed diff hunk (the
    # '@@ -26,8 +131,14 @@' line below is scrape residue, not code) — the
    # loop that defines `scorer_list` and `func` is not visible here; recover
    # it from the repository before editing.
    functions_list = getmembers(evaluation_metrics, isfunction)
    ......@@ -26,8 +131,14 @@ def make_scorers():
    # greater_is_better=True assumes every metric is a reward, not a loss —
    # TODO confirm against evaluation_metrics.
    scorer_list.append(make_scorer(func, greater_is_better=True))
    return scorer_list
# ---- Script entry: load the pre-generated training matrices -----------------
# The stale pre-diff calls `X_train, y_train = load_data()` and
# `X_test = load_data()` were removed: load_data() does not return a
# (features, target) pair, so unpacking its result raised at import time.
protein_data_path = 'generated/TRAIN_protein_data.csv'
intrx_compound_path = 'generated/TRAIN_intrx_compound_data.csv'
X_train, y_train = load_training_data(protein_data_path, intrx_compound_path)
#X_test = load_data()
# change when we have the real dataset...
# Build pipeline
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment