
Commit 6783d9ea authored by Delora Baptista

Merge remote-tracking branch 'origin/master'

parents e1c90a7c 8285567b
import json
from itertools import chain

import numpy
import pandas as pd
import requests

file_path, sep = '/home/skapur/Pfam-A.regions.uniprot.tsv.gz', '\t'
rcols = ['uniprot_acc', 'seq_version', 'pfamA_acc', 'seq_start', 'seq_end']

training_df = pd.read_csv('generated/training_df.csv')
# target_id may hold comma-separated lists of accessions; flatten them into one set
protein_set = set(s.strip() for s in ','.join(training_df.target_id.dropna().unique()).split(','))
def read_csv_by_chunks(file_path, sep, chunksize, protein_list, verbose=False, **kwargs):
    """Stream a large CSV in chunks, keeping only rows whose uniprot_acc is in protein_list."""
    df = None
    for i, chunk in enumerate(pd.read_csv(file_path, sep=sep, chunksize=int(chunksize), **kwargs)):
        chunk_protein = chunk.loc[chunk.uniprot_acc.isin(protein_list), :]
        if df is None:
            df = chunk_protein
        elif chunk_protein.shape[0] > 0:
            # DataFrame.append is deprecated; pd.concat is the supported equivalent
            df = pd.concat([df, chunk_protein])
        if verbose:
            print('Chunk #', i, 'has shape', chunk_protein.shape, '--', 'Dataframe shape:', df.shape)
    return df
# df = pd.read_csv('generated/pfam_training_df.csv')
df = read_csv_by_chunks(file_path, sep, int(1e7), protein_set, True, usecols=rcols)
df.to_csv('generated/pfam_training_df.csv')

uniprot_accession_url = lambda accessions: 'https://www.ebi.ac.uk/proteins/api/proteins?offset=0&size=-1&accession=' + ','.join(accessions)
def get_json_from_uniprot(acc_list):
    """Fetch full UniProt entries for a batch of accessions from the EBI Proteins API."""
    rq = requests.get(uniprot_accession_url(acc_list), headers={'Accept': 'application/json'})
    return json.loads(rq.content)
protein_list = list(protein_set)
# Query in batches of 100 accessions to keep request URLs a manageable size
full_entries = list(chain(*[get_json_from_uniprot(protein_list[i:i + 100]) for i in range(0, len(protein_list), 100)]))

with open('generated/uniprot_df.json', 'w') as f:
    json.dump(full_entries, f)
# uniprot_fields = ['accession','sequence.sequence']
# uniprot_df = pd.io.json.json_normalize(full_entries)

with open('generated/uniprot_df.json', 'r') as f:
    uniprot_json = json.load(f)
def get_dict_from_json(obj):
    """Keep the sequence plus selected feature and database cross-reference types from a UniProt entry."""
    feature_keys = ['DOMAIN', 'NP_BIND', 'MOTIF', 'ACT_SITE', 'BINDING', 'MOD_RES', 'MUTAGEN']
    db_keys = ['GO', 'InterPro', 'Pfam', 'PROSITE', 'SUPFAM']
    return {
        'sequence': obj['sequence']['sequence'],
        'features': [item for item in obj['features'] if item['type'] in feature_keys]
                    if 'features' in obj else None,
        'dbReferences': [item for item in obj['dbReferences'] if item['type'] in db_keys]
                        if 'dbReferences' in obj else None
    }
uniprot_seqs = {item['accession']: get_dict_from_json(item) for item in uniprot_json}
uniprot_df_seqs = pd.DataFrame.from_dict(uniprot_seqs).T
# Collect every distinct dbReference ID across all entries (guarding against entries without any)
unique_ids = list(chain(*uniprot_df_seqs.dbReferences.apply(lambda x: [y['id'] for y in x] if x else [])))
# Build a binary (one-hot) identifier vector for each accession
acc_to_identifiers = {}
for acc in uniprot_df_seqs.index:
    empty_vec = [0] * len(unique_ids)
    # dbReferences may be None for entries without any of the selected types
    for entry in uniprot_df_seqs.loc[acc, 'dbReferences'] or []:
        empty_vec[unique_ids.index(entry['id'])] = 1
    acc_to_identifiers[acc] = empty_vec
acc_to_identifiers_df = pd.DataFrame(acc_to_identifiers).T
acc_to_identifiers_df.columns = unique_ids
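# A more idiomatic alternative to the one-hot loop above could use scikit-learn's
# MultiLabelBinarizer. sklearn is not imported anywhere in this script, so this is
# left as a commented sketch rather than a drop-in replacement:
# from sklearn.preprocessing import MultiLabelBinarizer
# id_lists = uniprot_df_seqs.dbReferences.apply(lambda x: [y['id'] for y in x] if x else [])
# mlb = MultiLabelBinarizer(classes=unique_ids)
# acc_to_identifiers_df = pd.DataFrame(mlb.fit_transform(id_lists),
#                                      index=uniprot_df_seqs.index, columns=unique_ids)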
acc_to_identifiers_df.to_csv('generated/accession_to_identifiers_df.csv')
# Slice each Pfam domain's subsequence out of the full protein (coordinates are 1-based, inclusive)
domain_sequences = []
for acc, pfam, start, end in zip(df.uniprot_acc, df.pfamA_acc, df.seq_start, df.seq_end):
    if acc in uniprot_df_seqs.index:
        sequence = uniprot_df_seqs.loc[acc, 'sequence'][start - 1:end]
        domain_sequences.append((acc, pfam, sequence))
domain_df = pd.DataFrame(domain_sequences, columns=['uniprot_acc', 'pfamA_acc', 'domain_sequence'])
domain_df.to_csv('generated/domain_sequences.csv')
# test_template = pd.read_csv('dream/round_1_template.csv')
# uniprot_df_seqs.loc[test_template.UniProt_Id.unique(),:].features.apply()
# domain_df_test = domain_df[domain_df.uniprot_acc.isin(test_template.UniProt_Id.unique())]
# TODO: THIS SHOULD BE A SEPARATE SCRIPT
training_df = pd.read_csv('generated/training_df.csv')
domain_df = pd.read_csv('generated/domain_sequences.csv')
acc_to_identifiers_df = pd.read_csv('generated/accession_to_identifiers_df.csv')
with open('generated/uniprot_df.json', 'r') as f:
    uniprot_json = json.load(f)
uniprot_seqs = {item['accession']: get_dict_from_json(item) for item in uniprot_json}
uniprot_df = pd.DataFrame.from_dict(uniprot_seqs).T
pfam_df = pd.read_csv('generated/pfam_training_df.csv')
comp_prot_combinations = training_df[['compound_id', 'target_id']].drop_duplicates()
comp_prot_combinations['affinity_zinc'] = numpy.nan
def get_mappings_from_unichem(identifier, source=1, dest=9):
    """Map a compound ID between UniChem sources (defaults: 1 = ChEMBL, 9 = ZINC)."""
    unichem_url = 'https://www.ebi.ac.uk/unichem/rest/src_compound_id/' + identifier + '/' + str(source)
    req_res = requests.get(unichem_url)
    # Parse the JSON response safely instead of eval-ing remote content
    matches = [item['src_compound_id'] for item in json.loads(req_res.content.decode('utf-8'))
               if item['src_id'] == str(dest)]
    return matches[0] if len(matches) > 0 else None
def observations_from_zinc(zinc_id):
    """Fetch affinity observations, with UniProt ortholog accessions, for a ZINC substance."""
    # Field selection matches the 'ortholog.uniprot' key read in the loop below
    zinc_url = 'http://zinc15.docking.org/substances/' + zinc_id + '/observations.json:affinity+ortholog.uniprot'
    req_res = requests.get(zinc_url)
    return json.loads(req_res.content.decode('utf-8'))
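# Illustrative (commented) usage of the two helpers above, with one hypothetical example
# compound; CHEMBL25 is aspirin's ChEMBL accession, and the ZINC ID it maps to is
# whatever UniChem returns at query time:
# zinc_id = get_mappings_from_unichem('CHEMBL25')  # ChEMBL (src 1) -> ZINC (src 9)
# if zinc_id:
#     print(observations_from_zinc(zinc_id))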
for comp in comp_prot_combinations.compound_id.unique():
    targets = comp_prot_combinations.loc[comp_prot_combinations.compound_id == comp, 'target_id']
    zinc_id = get_mappings_from_unichem(comp)
    if zinc_id:
        print('Query:', comp, zinc_id)
        observations = observations_from_zinc(zinc_id)
        # obs_dict = {observation['ortholog.uniprot']: observation['affinity'] for observation in observations if observation['ortholog.uniprot'] in targets}
        if observations is not None and len(observations) > 0:
            for obs in observations:
                protein, affinity = obs['ortholog.uniprot'], obs['affinity']
                hasComp = comp_prot_combinations.compound_id == comp
                hasProt = comp_prot_combinations.target_id == protein
                comp_prot_combinations.loc[hasComp & hasProt, 'affinity_zinc'] = affinity
for descriptor, df in zip(descriptors, descriptor_dfs):
    ...
final_descriptor_df = pd.concat([df.set_index('smiles') for df in tdfs], axis=1, join='inner').reset_index()
training_df = pd.merge(finaldf, final_descriptor_df, on='smiles', how='right').dropna(how='all', axis=1)
training_df = training_df[[col for col in training_df.columns if col not in extra_cols]]
good_units = training_df.standard_units == 'NM'
has_values = ~training_df.standard_value.isnull()
training_df = training_df[good_units & has_values & ~training_df.target_id.isna()]
training_df = training_df[~training_df.target_id.apply(lambda x: ',' in x)]
# TODO: fix rows with multiple values on target_id
# baddf = training_df[training_df.target_id.apply(lambda x: ',' in x)]
# subdf_correct = training_df[~training_df.target_id.apply(lambda x: ',' in x)]
#
# def replace_value_in_row(row, id, sep=','):
#     items = [s.strip() for s in row[id].split(sep)]
#     df_to_add = [row] * len(items)
#     for item, row in zip(items, df_to_add):
#         row[id] = item
#     return df_to_add
#
# new_rows = list(chain(*baddf.apply(lambda x: replace_value_in_row(row=x, id='target_id'), axis=1)))
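# One possible resolution for the TODO above, assuming pandas >= 0.25 (which added
# DataFrame.explode). This would replace the filter that currently drops multi-target
# rows, so it is left as a commented sketch:
# multi = training_df.target_id.str.contains(',')
# exploded = training_df[multi].assign(
#     target_id=training_df.loc[multi, 'target_id'].str.split(',').apply(lambda ids: [s.strip() for s in ids])
# ).explode('target_id')
# training_df = pd.concat([training_df[~multi], exploded], ignore_index=True)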