Commit c651b918 authored by Delora Baptista's avatar Delora Baptista
Browse files

[ADD] machine learning pipeline

parent 014ac135
from inspect import getmembers, isfunction

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest
from sklearn.impute import SimpleImputer
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVR

from src import evaluation_metrics
def load_data():
    """Load the dataset (protein and compound features already calculated).

    The binding affinity output needs to be transformed to pKd (-log10 Kd).

    Returns:
        tuple: ``(X, y)`` feature matrix and target vector when targets
        exist, otherwise ``X`` alone (e.g. for the challenge test set).

    Raises:
        NotImplementedError: always, until the real dataset is available.
    """
    # Bug fix: the original body contained only comments, which is a
    # SyntaxError. Keep an explicit stub until the loader is implemented.
    raise NotImplementedError('load_data is not implemented yet')
def make_scorers():
    """Build sklearn scorers from every function in ``evaluation_metrics``.

    Error metrics such as RMSE are wrapped with ``greater_is_better=False``
    so cross-validation maximizes the (sign-flipped) score correctly; every
    other metric is assumed to be "higher is better".

    Returns:
        list: ``make_scorer``-wrapped callables, one per metric function.
    """
    functions_list = getmembers(evaluation_metrics, isfunction)
    scorer_list = []
    for metric_name, func in functions_list:
        if metric_name == 'rmse':
            # RMSE is an error metric: lower is better, so negate the score.
            scorer_list.append(make_scorer(func, greater_is_better=False))
        else:
            # Bug fix: the True-signed scorer used to be appended
            # unconditionally, so 'rmse' got a second scorer with the
            # wrong sign. Only non-error metrics take this branch now.
            scorer_list.append(make_scorer(func, greater_is_better=True))
    return scorer_list
X_train, y_train = load_data()
X_test = load_data()
# change when we have the real dataset...
# Build pipeline
# numeric_features =
numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler())])
# categorical_features =
categorical_transformer = Pipeline(steps=['imputer', SimpleImputer(strategy='most_frequent'), ('onehot', OneHotEncoder())])
#binary_features =
binary_transformer = SimpleImputer(strategy='most_frequent')
preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, numeric_features),
('cat', categorical_transformer, categorical_features),
('bin', binary_transformer, binary_features)])
selector = SelectKBest(k=100) # change k; use another method?
estimator = SVR() # need to change this easily when testing though
pipe = Pipeline(steps=[('preprocessor', preprocessor),
('feature_selector', selector),
('estimator', estimator)])
# Hyperparameter optimization???
# ...
# Model Evaluation
scorer_list = make_scorers()
cv_results = cross_validate(pipe, X_train, y_train, cv=10, scoring=scorer_list)
for res in cv_results:
# save cross-validation results to file?
# Prediction
model =, y_train)
predictions = model.predict(X_test)
# Save predictions to csv (use challenge template)
df = pd.read_csv('../data/round_1_template.csv')
df = pd.concat([df, predictions], axis=1) # make sure order is maintained in predictions
df.to_csv('round1_predictions.csv') # change later
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment