GitLab CI for Docker builds is enabled! You can enable it using a .gitlab-ci.yml file in your project. Check the file template at https://gitlab.bio.di.uminho.pt/snippets/5

Commit 5e78813a authored by Vítor Vieira
Browse files

[ADD] Training dataset generation

parent 8d21c0bf
from src import evaluation_metrics, data_preprocessing
from src import evaluation_metrics
from src.data_preprocessing import *
import pandas as pd
import numpy as np
......@@ -48,13 +48,18 @@ def load_features():
return interaction_data, compound_df, final_protein_df
def load_test_data(template_path):
    """Load the test template CSV and join compound and protein features onto it.

    Parameters
    ----------
    template_path : str
        Path to a CSV file with at least the columns 'Compound_SMILES'
        and 'UniProt_Id'.

    Returns
    -------
    pandas.DataFrame
        The template rows with the compound feature columns (Morgan,
        MACCS, MolLogP) and the matching protein feature columns joined in.
    """
    # BUG FIX: the previous zero-argument version referenced an undefined
    # `template_path` (NameError on every call); the parameter is restored.
    template = pd.read_csv(template_path)
    # Compound descriptors computed from the SMILES column of the template.
    test_compound_df = get_compound_feature_dataframe(
        template, ['Morgan', 'MACCS', 'MolLogP'], 'Compound_SMILES')
    # NOTE(review): `final_protein_df` is not defined in this function —
    # presumably the module-level frame returned by load_features(); confirm.
    test_protein_df = final_protein_df.loc[template.UniProt_Id]
    # Join compound features on SMILES, then protein features on UniProt id.
    template_w_compounds = template.set_index('Compound_SMILES').join(test_compound_df).reset_index()
    template_w_all = template_w_compounds.set_index('UniProt_Id').join(test_protein_df).reset_index()
    return template_w_all
def make_scorers():
functions_list = getmembers(evaluation_metrics, isfunction)
scorer_list = []
......@@ -87,8 +92,24 @@ def load_training_data():
## TODO: Remove compound and protein indexes (smiles/uniprot)
return df_w_all
# NOTE(review): stale definition left over from the diff — superseded by the
# get_training_df(df) version defined later in this file; one of the two
# should be removed.
def get_training_df():
df_w_all = load_training_data().drop(columns=['level_0', 'index', 'standard_inchi_key'])
#
# template = pd.read_csv(template_path)
# interaction_data = add_chemical_identifier_to_df(preprocess_interaction_data(interactions_path))
#
# sub_int_data = interaction_data[interaction_data.target_id.isin(template.UniProt_Id)]
# sub_int_data.target_id.value_counts()
# comp_df = get_compound_feature_dataframe(template, ['Morgan','MACCS','MolLogP'], 'Compound_SMILES')
#
# Load the full training frame once at module level; `X` is consumed by the
# get_training_df(X) call further down in this script.
X = load_training_data()
# (template.UniProt_Id == 'P00519').sum()
# test_data = load_test_data()
#
# X = X[X['level_0'].isin(template.UniProt_Id)]
#
# len((X['level_0'] + X['index']).unique())
def get_training_df(df):
df_w_all = df.drop(columns=['level_0', 'index', 'standard_inchi_key'])
X_train, y_train = df_w_all.drop(columns=['standard_value']), df_w_all['standard_value']
......@@ -103,12 +124,25 @@ def get_training_df():
return X_train, y_train, numeric_features, binary_features
# NOTE(review): diff residue — the next two lines are the pre- and post-change
# versions of the same call; only the get_training_df(X) form matches the
# current one-parameter signature, so the zero-argument line should go.
X_train, y_train, numeric_features, binary_features = get_training_df()
X_train, y_train, numeric_features, binary_features = get_training_df(X)
# Persist the generated training matrices for downstream pipeline steps.
X_train.to_csv('generated/final_X_train.csv')
y_train.to_csv('generated/final_y_train.csv')
# Pickle the feature-name lists so the prediction side can rebuild the same
# numeric/binary column grouping.  NOTE(review): `pickle` is not among the
# visible imports — confirm it is imported elsewhere in the file.
with open('generated/final_x_numerical_features.pkl', 'wb') as f:
pickle.dump(numeric_features, f)
with open('generated/final_x_binary_features.pkl', 'wb') as f:
pickle.dump(binary_features, f)
#X_test = load_data()
# change when we have the real dataset...
# Build pipeline
# numeric_features =
# Median-impute then standardise every numeric feature column.
numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler())])
# categorical_features =
#categorical_transformer = Pipeline(steps=['imputer', SimpleImputer(strategy='most_frequent'), ('onehot', OneHotEncoder())])
......@@ -117,13 +151,17 @@ binary_transformer = SimpleImputer(strategy='most_frequent')
# Route numeric and binary columns through their respective transformers;
# `binary_transformer` is defined in lines elided by the diff hunk above.
preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, numeric_features),
# ('cat', categorical_transformer, categorical_features),
('bin', binary_transformer, binary_features)])
# NOTE(review): diff residue — two selector assignments survive; the later
# k=500 assignment is the one that takes effect.
selector = SelectKBest(k=1000) # change k; use another method?
# Drop near-constant features before univariate selection.
variance_threshold = VarianceThreshold(threshold=0.05)
selector = SelectKBest(k=500) # change k; use another method?
estimator = SVR() # need to change this easily when testing though
# NOTE(review): diff residue — this first Pipeline is immediately overwritten
# by the estimator-less version below; only the second binding is used, so
# `estimator` above is currently dead.
pipe = Pipeline(steps=[('preprocessor', preprocessor),
('feature_selector', selector),
('estimator', estimator)])
pipe = Pipeline(steps=[('variance', variance_threshold),
('preprocessor', preprocessor),
('feature_selector', selector)])
# ('estimator', estimator)])
# Fit the transform-only pipeline on the training data and materialise the
# selected feature matrix (scratch check of its column sums).
pipe.fit(X_train, y_train)
aaa = pipe.transform(X_train)
sum(aaa)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.