Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
Vítor Vieira
idg-dream-challenge
Commits
c651b918
Commit
c651b918
authored
Nov 13, 2018
by
Delora Baptista
Browse files
[ADD] machine learning pipeline
parent
014ac135
Changes
1
Hide whitespace changes
Inline
Side-by-side
src/machine_learning.py
0 → 100644
View file @
c651b918
from inspect import getmembers, isfunction

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.impute import SimpleImputer
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVR

from src import evaluation_metrics
def load_data():
    """Placeholder for the dataset loader (not implemented yet).

    Planned behaviour, as noted by the author:
      * load the dataset (protein and compound features already calculated);
      * transform the output to pKd (-log Kd);
      * return X and y (if it exists).

    For now the function does nothing and returns ``None``.
    """
    return None
def make_scorers():
    """Wrap every function member of ``evaluation_metrics`` as a scorer.

    Each function reported by ``getmembers(evaluation_metrics, isfunction)``
    is passed to ``make_scorer``. The member named ``'rmse'`` is registered
    with ``greater_is_better=False``; every other member is registered with
    ``greater_is_better=True``.

    Returns:
        A list of scorer objects, in the order ``getmembers`` yields the
        functions.

    NOTE(review): for multiple callable scorers, ``cross_validate`` appears to
    expect a ``{name: scorer}`` dict rather than a list -- confirm how callers
    consume this result.
    """
    return [
        make_scorer(metric, greater_is_better=(name != 'rmse'))
        for name, metric in getmembers(evaluation_metrics, isfunction)
    ]
# Training / evaluation / prediction script.
#
# Steps: load the data, build a preprocessing + feature-selection + SVR
# pipeline, cross-validate it with every metric from `evaluation_metrics`,
# then fit on the full training set and write predictions next to the
# challenge template.

X_train, y_train = load_data()
X_test = load_data()  # change when we have the real dataset...

# Build pipeline
# TODO: numeric_features, categorical_features and binary_features are not
# assigned anywhere yet; the ColumnTransformer below needs them (column
# selectors for X) before this script can run.

# numeric_features =
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# categorical_features =
# Every pipeline step has to be a (name, transformer) tuple; the 'imputer'
# step was previously missing its enclosing parentheses.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder()),
])

# binary_features =
binary_transformer = SimpleImputer(strategy='most_frequent')

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
    ('bin', binary_transformer, binary_features),
])

# The target is continuous (pKd, fitted with SVR), so score features with the
# regression F-test; SelectKBest's default (f_classif) is for classification.
selector = SelectKBest(score_func=f_regression, k=100)  # change k; use another method?
estimator = SVR()  # need to change this easily when testing though

pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('feature_selector', selector),
    ('estimator', estimator),
])

# Hyperparameter optimization???
# ...

# Model Evaluation
scorer_list = make_scorers()
# cross_validate accepts several callable scorers only as a {name: scorer}
# dict, so key each scorer by the name of the metric function it wraps.
# NOTE: `_score_func` is a private scorer attribute; the list returned by
# make_scorers() carries no names, so this is the only way to recover them.
scoring = {scorer._score_func.__name__: scorer for scorer in scorer_list}
cv_results = cross_validate(pipe, X_train, y_train, cv=10, scoring=scoring)
for res in cv_results:
    print(res + ':\n')
    print(cv_results[res])
# save cross-validation results to file?

# Prediction
model = pipe.fit(X_train, y_train)
predictions = model.predict(X_test)

# Save predictions to csv (use challenge template)
df = pd.read_csv('../data/round_1_template.csv')
# predict() returns a NumPy array, which pd.concat rejects; wrap it in a
# Series aligned on the template's index. This also raises if the number of
# predictions does not match the number of template rows.
# TODO: give the column the name required by the challenge template.
prediction_column = pd.Series(predictions, index=df.index)
df = pd.concat([df, prediction_column], axis=1)  # make sure order is maintained in predictions
df.to_csv('round1_predictions.csv')  # change later
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment