
Commit 9f1956b1 authored by Vítor Vieira

[ADD] ZINC affinity

parent 2e4211b8
import pandas as pd
import json
import numpy
import requests

# Pfam-A region annotations: UniProt accession -> domain coordinates
file_path, sep = '/home/skapur/Pfam-A.regions.uniprot.tsv.gz', '\t'
rcols = ['uniprot_acc', 'seq_version', 'pfamA_acc', 'seq_start', 'seq_end']

training_df = pd.read_csv('generated/training_df.csv')

# target_id holds comma-separated UniProt accessions, so joining and re-splitting
# flattens multi-target rows into a single accession set
protein_set = set([s.strip() for s in ','.join(training_df.target_id.dropna().unique()).split(',')])
def read_csv_by_chunks(file_path, sep, chunksize, protein_list, verbose=False, **kwargs):
    # Stream the large TSV in chunks, keeping only rows whose accession is in
    # protein_list; chunks are collected and concatenated once at the end
    # (avoids the deprecated DataFrame.append)
    chunks = []
    for i, chunk in enumerate(pd.read_csv(file_path, sep=sep, chunksize=chunksize, **kwargs)):
        chunk_protein = chunk.loc[chunk.uniprot_acc.isin(protein_list), :]
        if chunk_protein.shape[0] > 0:
            chunks.append(chunk_protein)
        if verbose:
            print("Chunk #", i, 'has shape', chunk_protein.shape, '--', 'rows kept so far:', sum(len(c) for c in chunks))
    return pd.concat(chunks) if chunks else None

df = read_csv_by_chunks(file_path, sep, 1e7, protein_set, True, usecols=rcols)
df.to_csv('generated/pfam_training_df.csv')
uniprot_accession_url = lambda accessions: 'https://www.ebi.ac.uk/proteins/api/proteins?offset=0&size=-1&accession=' + ','.join(accessions)
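# For example, uniprot_accession_url(['P12345', 'Q67890']) evaluates to
# 'https://www.ebi.ac.uk/proteins/api/proteins?offset=0&size=-1&accession=P12345,Q67890'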
from itertools import chain

def get_json_from_uniprot(acc_list):
    # Fetch a batch of UniProt entries from the EBI Proteins API as JSON
    rq = requests.get(uniprot_accession_url(acc_list), headers={'Accept': 'application/json'})
    dct = json.loads(rq.content)
    return dct

# Query in batches of 100 accessions to keep each request URL short, then
# flatten the per-batch result lists into one list of entries
protein_list = list(protein_set)
full_entries = list(chain(*[get_json_from_uniprot(protein_list[i:i + 100]) for i in range(0, len(protein_list), 100)]))

with open('generated/uniprot_df.json', 'w') as f:
    json.dump(full_entries, f)
# uniprot_fields = ['accession','sequence.sequence']
# uniprot_df = pd.io.json.json_normalize(full_entries)
with open('generated/uniprot_df.json', 'r') as f:
    uniprot_json = json.load(f)
def get_dict_from_json(obj):
    # Keep only the feature and cross-reference types of interest from a
    # UniProt entry, plus its amino-acid sequence
    feature_keys = ['DOMAIN', 'NP_BIND', 'MOTIF', 'ACT_SITE', 'BINDING', 'MOD_RES', 'MUTAGEN']
    db_keys = ['GO', 'InterPro', 'Pfam', 'PROSITE', 'SUPFAM']
    return {
        'sequence': obj['sequence']['sequence'],
        'features': [item for item in obj['features'] if item['type'] in feature_keys] if 'features' in obj else None,
        'dbReferences': [item for item in obj['dbReferences'] if item['type'] in db_keys] if 'dbReferences' in obj else None
    }
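# A toy record illustrating the entry shape assumed above (values are made up
# for illustration, not real UniProt output):
_toy = {'accession': 'P00000',
        'sequence': {'sequence': 'MKT'},
        'features': [{'type': 'DOMAIN', 'begin': '1', 'end': '3'}],
        'dbReferences': [{'type': 'Pfam', 'id': 'PF00001'}]}
assert get_dict_from_json(_toy)['sequence'] == 'MKT'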
uniprot_seqs = {item['accession']: get_dict_from_json(item) for item in uniprot_json}
uniprot_df_seqs = pd.DataFrame.from_dict(uniprot_seqs).T
@@ -70,10 +76,10 @@
unique_ids = list(chain(*uniprot_df_seqs.dbReferences.apply(lambda x: [y['id'] for y in x])))
# Build a 0/1 indicator matrix: one row per accession, one column per
# cross-reference identifier (entries without dbReferences are skipped)
acc_to_identifiers = {}
for acc in uniprot_df_seqs.index:
    empty_vec = [0] * len(unique_ids)
    for entry in uniprot_df_seqs.loc[acc, 'dbReferences'] or []:
        empty_vec[unique_ids.index(entry['id'])] = 1
    acc_to_identifiers[acc] = empty_vec

acc_to_identifiers_df = pd.DataFrame(acc_to_identifiers).T
acc_to_identifiers_df.columns = unique_ids
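# The same indicator matrix can be built without manual index bookkeeping.
# A minimal alternative sketch, assuming each dbReferences value is a list of
# dicts with an 'id' key (None values are skipped via `or []`):
_pairs = pd.DataFrame(
    [(acc, ref['id'])
     for acc in uniprot_df_seqs.index
     for ref in (uniprot_df_seqs.loc[acc, 'dbReferences'] or [])],
    columns=['accession', 'identifier'])
_alt_identifiers_df = pd.crosstab(_pairs.accession, _pairs.identifier).clip(upper=1)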
@@ -82,15 +88,57 @@
acc_to_identifiers_df.to_csv('generated/accession_to_identifiers_df.csv')
# Cut each Pfam domain out of its parent protein sequence; seq_start/seq_end
# are 1-based and inclusive, hence the [start - 1:end] slice
domain_sequences = []
for acc, pfam, start, end in list(zip(df.uniprot_acc, df.pfamA_acc, df.seq_start, df.seq_end)):
    if acc in uniprot_df_seqs.index:
        sequence = uniprot_df_seqs.loc[acc, 'sequence'][start - 1:end]
        domain_sequences.append((acc, pfam, sequence))

domain_df = pd.DataFrame(domain_sequences, columns=['uniprot_acc', 'pfamA_acc', 'domain_sequence'])
domain_df.to_csv('generated/domain_sequences.csv')
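# Quick sanity check of the coordinate convention assumed above: residues 2..4
# of a toy sequence should come out as 'BCD'
assert 'ABCDEF'[2 - 1:4] == 'BCD'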
#
# test_template = pd.read_csv('dream/round_1_template.csv')
# uniprot_df_seqs.loc[test_template.UniProt_Id.unique(),:].features.apply()
# domain_df_test = domain_df[domain_df.uniprot_acc.isin(test_template.UniProt_Id.unique())]
# TODO: THIS SHOULD BE A SEPARATE SCRIPT
training_df = pd.read_csv('generated/training_df.csv')
domain_df = pd.read_csv('generated/domain_sequences.csv')
acc_to_identifiers_df = pd.read_csv('generated/accession_to_identifiers_df.csv')
with open('generated/uniprot_df.json', 'r') as f:
uniprot_json = json.load(f)
uniprot_seqs = {item['accession']: get_dict_from_json(item) for item in uniprot_json}
uniprot_df = pd.DataFrame.from_dict(uniprot_seqs).T
pfam_df = pd.read_csv('generated/pfam_training_df.csv')
comp_prot_combinations = training_df[['compound_id','target_id']].drop_duplicates()
comp_prot_combinations['affinity_zinc'] = numpy.nan
def get_mappings_from_unichem(identifier, source=1, dest=9):
    # Map a compound identifier between UniChem sources; by default from
    # ChEMBL (src_id 1) to ZINC (src_id 9). Returns None when there is no match.
    unichem_url = "https://www.ebi.ac.uk/unichem/rest/src_compound_id/" + identifier + "/" + str(source)
    req_res = requests.get(unichem_url)
    # json.loads rather than eval: the endpoint returns JSON, and eval on
    # remote content is unsafe
    matches = [item['src_compound_id'] for item in json.loads(req_res.content.decode('utf-8')) if item['src_id'] == str(dest)]
    return matches[0] if len(matches) > 0 else None
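# Example use (a sketch; assumes the compound has a ZINC cross-reference):
# get_mappings_from_unichem('CHEMBL25')  # aspirin -> its ZINC accession, or None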
def observations_from_zinc(zinc_id):
    # Pull affinity observations, with their UniProt orthologs, for a ZINC15
    # substance; the requested fields match the keys read in the loop below
    zinc_url = 'http://zinc15.docking.org/substances/' + zinc_id + '/observations.json:affinity+ortholog.uniprot'
    req_res = requests.get(zinc_url)
    return json.loads(req_res.content.decode('utf-8'))
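# Both UniChem and ZINC are public endpoints, so transient failures are likely
# when looping over many compounds. A minimal retry wrapper one could use for
# the GET calls above (a sketch; names and parameters are illustrative):
import time

def get_with_retries(url, n_retries=3, wait=1.0, **kwargs):
    # Retry a GET a few times, pausing between attempts; re-raise on final failure
    for attempt in range(n_retries):
        try:
            response = requests.get(url, **kwargs)
            response.raise_for_status()
            return response
        except requests.RequestException:
            if attempt == n_retries - 1:
                raise
            time.sleep(wait)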
# For every compound in the training set, map it to ZINC and record any
# affinity observations reported for its known targets
for comp in comp_prot_combinations.compound_id.unique():
    targets = comp_prot_combinations.loc[comp_prot_combinations.compound_id == comp, 'target_id']
    zinc_id = get_mappings_from_unichem(comp)
    if zinc_id:
        print('Query:', comp, zinc_id)
        observations = observations_from_zinc(zinc_id)
        # obs_dict = {observation['ortholog.uniprot']: observation['affinity'] for observation in observations if observation['ortholog.uniprot'] in targets}
        if observations is not None and len(observations) > 0:
            for obs in observations:
                protein, affinity = obs['ortholog.uniprot'], obs['affinity']
                hasComp = comp_prot_combinations.compound_id == comp
                hasProt = comp_prot_combinations.target_id == protein
                comp_prot_combinations.loc[hasComp & hasProt, 'affinity_zinc'] = affinity
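# Persisting the merged affinities keeps reruns cheap; the file name here is an
# assumption, mirroring the other generated/ outputs above
comp_prot_combinations.to_csv('generated/zinc_affinity_df.csv')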