GitLab CI for Docker builds is enabled! You can turn it on for your project by adding a .gitlab-ci.yml file. See the file template at https://gitlab.bio.di.uminho.pt/snippets/5

Commit 8cc120e8 authored by Vítor Vieira's avatar Vítor Vieira
Browse files

Merge remote-tracking branch 'origin/master'

parents c0a3fb01 6783d9ea
import pandas as pd
import numpy as np
# Always print arrays in full, never abbreviated with "...".
# NaN is rejected as a threshold by current NumPy (ValueError), so an
# infinite threshold is used instead: no array size can exceed it.
# NOTE(review): presumably needed because descriptor arrays are stored as
# text in the CSV files and must not be truncated -- confirm.
np.set_printoptions(threshold=np.inf)
from pydpi import pypro
'''Proteins using pydpi'''
def _create_matrix_with_data(df, column):
def __convert_str_to_numpy(string):
if string is not np.nan:
return np.fromstring(''.join(string).replace('[', '').replace(']', '').replace(' ', ','), sep=',')
df[column] = df[column].apply(lambda x: __convert_str_to_numpy(x))
i = 0
length = [len(df[column][i]) if isinstance(df[column][i], np.ndarray) else None for i in range(0, len(df[column]))]
new_df = pd.DataFrame(np.concatenate(
[x.reshape(1, -1) if isinstance(x, np.ndarray) else np.array([np.nan] * length[0]).reshape(1, -1) for x in
(df[column]).tolist()], axis=0)).set_index(df.iloc[:, 0])
return new_df
def proteins(df_with_chembl_and_canonical_smiles, df=None):
    """Compute pydpi descriptors for every protein that still lacks them.

    :param df_with_chembl_and_canonical_smiles: table with an ``accession``
        column. It is read only when ``df`` is None, to create an empty
        descriptor table with one column per unique accession.
    :param df: optional existing descriptor table, with descriptor names as
        index and protein accessions as columns. Every column whose
        ``'AAComp'`` cell is missing is recomputed. The table is modified
        in place.
    :return: tuple ``(table, missing_proteins)``. ``table`` is the
        descriptor table transposed (one row per protein);
        ``missing_proteins`` lists the accessions for which obtaining or
        reading the sequence, or storing the result, raised an exception.
    """
    descriptor_names = ['AAComp', 'CTD', 'DPComp', 'APAAC', 'Moran', 'QSO',
                        'Triad', 'SOCN', 'Tripeptide']

    def _is_missing(value):
        # Identity comparison with ``np.nan`` is unreliable: NaNs loaded
        # from a file are usually different float objects. The ndim guard
        # keeps already-computed arrays away from ``pd.isna``, which would
        # return an element-wise array for them.
        return np.ndim(value) == 0 and bool(pd.isna(value))

    def _get_descriptor(desc_function):
        # One failing descriptor must not discard the others, so each is
        # evaluated on its own and replaced by NaN on error.
        try:
            # ``list(...)`` is required on Python 3, where ``.values()`` is
            # a view that np.array would wrap as a single 0-d object.
            return np.array(list(desc_function().values()))
        except Exception as e:
            print(e)
            return np.nan

    def _protein_descriptors(protein_object):
        return pd.Series({
            'AAComp': _get_descriptor(protein_object.GetAAComp),
            'CTD': _get_descriptor(protein_object.GetCTD),
            'DPComp': _get_descriptor(protein_object.GetDPComp),
            'Moran': _get_descriptor(protein_object.GetMoranAuto),
            'APAAC': _get_descriptor(protein_object.GetAPAAC),
            'QSO': _get_descriptor(protein_object.GetQSO),
            'Triad': _get_descriptor(protein_object.GetTriad),
            'SOCN': _get_descriptor(protein_object.GetSOCN),
            'Tripeptide': _get_descriptor(protein_object.GetTPComp),
        })

    if df is None:
        accessions = df_with_chembl_and_canonical_smiles['accession'].unique()
        df = pd.DataFrame(data=None, columns=accessions,
                          index=descriptor_names)

    po = pypro.PyPro()  # po = ProteinObject
    missing_proteins = []
    for protein in df.columns:
        print(protein)
        if not _is_missing(df.loc['AAComp', protein]):
            continue
        print('Redoing the analysis')
        try:
            # NOTE(review): presumably retrieves the sequence for this
            # accession from a remote service -- confirm in pydpi.
            sequence = po.GetProteinSequenceFromID(protein)
            po.ReadProteinSequence(sequence)
            # Assignment aligns on the descriptor names in the index.
            df[protein] = _protein_descriptors(po)
        except Exception as e:
            # Best effort: record the failure and carry on with the rest.
            print(e)
            missing_proteins.append(protein)
    return df.T, missing_proteins
if __name__ == '__main__':
    # Script entry point: reload the descriptor table saved by an earlier
    # run, fill in the proteins that are still missing, and write the
    # results back to disk.
    chembl_table = pd.read_csv('chembl23_selected.csv')

    saved_table = pd.read_csv('protein_descriptors.csv', index_col=1)
    saved_table.index = saved_table['Unnamed: 0.1']
    # Drop the two leading columns and transpose before handing the table
    # to proteins().
    descriptors_by_protein = saved_table.iloc[:, 2:].T

    updated_table, failed_accessions = proteins(chembl_table,
                                                descriptors_by_protein)

    # TODO: the result below is not used yet.
    # NOTE(review): 'Tripeptide' is looked up as a column of the table that
    # was passed to proteins(), not of the transposed table it returned --
    # confirm this is the intended frame.
    tripeptide_matrix = _create_matrix_with_data(descriptors_by_protein,
                                                 'Tripeptide')
    # TODO

    updated_table.to_csv('protein_descriptors.csv')
    pd.Series(failed_accessions).to_csv('missing_proteins.txt')

    # Leftover experiment, kept for reference:
    # dfxx = pd.DataFrame(np.concatenate(
    #     [x.reshape(1, -1) if isinstance(x, np.ndarray)
    #      else np.array([np.nan] * 167).reshape(1, -1)
    #      for x in (qwer.MACCS).tolist()], axis=0)).set_index(qwer.iloc[:, 0])
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment