GitLab CI for Docker builds is enabled! You can enable it using .gitlab-ci.yml in your project. Check the file template at https://gitlab.bio.di.uminho.pt/snippets/5

Commit 32cb77d0 authored by Vítor Vieira
Browse files

[ADD] Alignment features code + data

parent 8cc120e8
This diff is collapsed.
This source diff could not be displayed because it is too large. You can view the blob instead.
This diff is collapsed.
This diff is collapsed.
......@@ -7,8 +7,12 @@ file_path, sep = '/home/skapur/Pfam-A.regions.uniprot.tsv.gz', '\t'
# Columns of interest from the Pfam-A regions table.
rcols = ['uniprot_acc', 'seq_version', 'pfamA_acc', 'seq_start', 'seq_end']
training_df = pd.read_csv('generated/training_df.csv')
r1_template = pd.read_csv('dream/round_1_template.csv')
# Union of all training target accessions (comma-separated lists are split
# and stripped) with the round-1 template proteins.
protein_set = set([s.strip() for s in ','.join(training_df.target_id.dropna().unique()).split(',')]) | \
set(r1_template.UniProt_Id.unique())
# NOTE(review): this immediately overwrites the union above, silently dropping
# the round-1 template proteins — confirm which definition is intended.
protein_set = set([s.strip() for s in ','.join(training_df.target_id.dropna().unique()).split(',')])
def read_csv_by_chunks(file_path, sep, chunksize, protein_list, verbose=False, **kwargs):
......@@ -95,6 +99,7 @@ for acc, pfam, start, end in list(zip(df.uniprot_acc, df.pfamA_acc, df.seq_start
# Collect the extracted (protein, domain, subsequence) triples into a frame
# and persist them for the downstream clan/alignment steps.
domain_df = pd.DataFrame(domain_sequences, columns=['uniprot_acc', 'pfamA_acc', 'domain_sequence'])
domain_df.to_csv('generated/domain_sequences.csv')
#
# test_template = pd.read_csv('dream/round_1_template.csv')
# uniprot_df_seqs.loc[test_template.UniProt_Id.unique(),:].features.apply()
......@@ -116,29 +121,113 @@ uniprot_df = pd.DataFrame.from_dict(uniprot_seqs).T
pfam_df = pd.read_csv('generated/pfam_training_df.csv')
# Every distinct (compound, target) pair in the training data; affinity_zinc
# starts as NaN and is filled in from ZINC15 observations where available.
comp_prot_combinations = training_df[['compound_id','target_id']].drop_duplicates()
comp_prot_combinations['affinity_zinc'] = numpy.nan
def get_mappings_from_unichem(identifier, source=1, dest=9):
    """Map a compound identifier between UniChem sources.

    Queries the UniChem REST API for *identifier* in source *source*
    (default 1 = ChEMBL) and returns the first matching identifier in
    source *dest* (default 9 = ZINC), or None when no mapping exists.
    """
    unichem_url = "https://www.ebi.ac.uk/unichem/rest/src_compound_id/"+identifier+"/"+str(source)
    req_res = requests.get(unichem_url)
    # FIX: the response is JSON — parse it with json.loads. The original used
    # eval(), which executes arbitrary code from the network and cannot parse
    # JSON literals such as true/false/null.
    matches = [item['src_compound_id'] for item in json.loads(req_res.content.decode('utf-8')) if item['src_id'] == str(dest)]
    return matches[0] if len(matches) > 0 else None
def observations_from_zinc(zinc_id):
    """Fetch affinity/ortholog observations for *zinc_id* from ZINC15.

    Returns the decoded JSON payload from the observations endpoint.
    """
    # NOTE(review): the URL requests the field 'ortholog.uniprot2' while the
    # consumer below reads 'ortholog.uniprot' — verify against the ZINC15 API.
    zinc_url = 'http://zinc15.docking.org/substances/'+zinc_id+'/observations.json:affinity+ortholog.uniprot2'
    response = requests.get(zinc_url)
    return json.loads(response.content.decode('utf-8'))
# For each training compound, map it to a ZINC id and copy any reported
# affinity for its (compound, target) pairs into comp_prot_combinations.
for comp in comp_prot_combinations.compound_id.unique():
    targets = comp_prot_combinations.loc[comp_prot_combinations.compound_id == comp,'target_id']
    zinc_id = get_mappings_from_unichem(comp)
    if zinc_id:
        print('Query:',comp,zinc_id)
        observations = observations_from_zinc(zinc_id)
        #obs_dict = {observation['ortholog.uniprot']: observation['affinity'] for observation in observations if observation['ortholog.uniprot'] in targets}
        # FIX: the original condition `observations is not None or len(observations) < 1`
        # raised TypeError when observations was None (len(None)) and admitted
        # empty result lists; a plain truthiness check expresses the intent.
        if observations:
            for obs in observations:
                protein, affinity = obs['ortholog.uniprot'], obs['affinity']
                hasComp = comp_prot_combinations.compound_id == comp
                hasProt = comp_prot_combinations.target_id == protein
                comp_prot_combinations.loc[hasComp & hasProt, 'affinity_zinc'] = affinity
# comp_prot_combinations = training_df[['compound_id','target_id']].drop_duplicates()
# comp_prot_combinations['affinity_zinc'] = numpy.nan
#
# def get_mappings_from_unichem(identifier, source=1, dest=9):
# unichem_url = "https://www.ebi.ac.uk/unichem/rest/src_compound_id/"+identifier+"/"+str(source)
# req_res = requests.get(unichem_url)
# matches = [item['src_compound_id'] for item in eval(req_res.content.decode('utf-8')) if item['src_id'] == str(dest)]
# return matches[0] if len(matches) > 0 else None
#
# def observations_from_zinc(zinc_id):
# zinc_url = 'http://zinc15.docking.org/substances/'+zinc_id+'/observations.json:affinity+ortholog.uniprot'
# req_res = requests.get(zinc_url)
# return json.loads(req_res.content.decode('utf-8'))
#
# for comp in comp_prot_combinations.compound_id.unique():
# targets = comp_prot_combinations.loc[comp_prot_combinations.compound_id == comp,'target_id']
# zinc_id = get_mappings_from_unichem(comp)
# if zinc_id:
# print('Query:',comp,zinc_id)
# observations = observations_from_zinc(zinc_id)
# #obs_dict = {observation['ortholog.uniprot']: observation['affinity'] for observation in observations if observation['ortholog.uniprot'] in targets}
# if observations is not None or len(observations) < 1:
# for obs in observations:
# protein, affinity = obs['ortholog.uniprot'], obs['affinity']
# hasComp = comp_prot_combinations.compound_id == comp
# hasProt = comp_prot_combinations.target_id == protein
# comp_prot_combinations.loc[hasComp & hasProt, 'affinity_zinc'] = affinity
#
# get_mappings_from_unichem(training_df.compound_id.unique()[2])
#
# observations_from_zinc('ZINC000058655571+ZINC000000001395')
# Map each Pfam domain accession to its clan via the Pfam-A clans table.
pfam_mapping = pd.read_csv('data/Pfam-A.clans.tsv', sep='\t', header=None)
pfam_mapping.columns = ['domain','clan','name','protein_name','description']
pfam_mapping = pfam_mapping.set_index('domain')
# Annotate every domain row with its clan, aligned by pfamA_acc order.
domain_df['clan'] = pfam_mapping.loc[domain_df.pfamA_acc, 'clan'].values
#r1_template = pd.read_csv('dream/round_1_template.csv')
r1_proteins = r1_template.UniProt_Id.unique()
# Proteins that carry at least one domain from clan CL0016.
pfam_proteins = domain_df[domain_df['clan'] == 'CL0016'].uniprot_acc.unique()
set(pfam_proteins) & protein_set  # NOTE(review): no-effect expression — REPL leftover?
missing = set(r1_proteins) - set(pfam_proteins)
len(r1_proteins)  # NOTE(review): no-effect expression — REPL leftover?
# Restrict to clan CL0016 domains and keep only domain families that occur
# more than four times, so each family has enough members to align.
sub_domain_df = domain_df[domain_df['clan'] == 'CL0016'].reset_index()
# to_remove_list = [
# 'Q16832',
# 'Q08345',
# 'Q9BVS4',
# 'Q9BRS2',
# 'O14730'
# ]
# Hoisted: value_counts() was computed twice in the original expression.
domain_counts = sub_domain_df.pfamA_acc.value_counts()
domains_to_add = domain_counts[domain_counts > 4].index.values
sub_domain_df = sub_domain_df[sub_domain_df.pfamA_acc.isin(domains_to_add)]
# sub_domain_df = sub_domain_df[~sub_domain_df.uniprot_acc.isin(to_remove_list)]
# Rows as raw arrays: (index, uniprot_acc, pfamA_acc, domain_sequence).
seqs_to_align = sub_domain_df.values
def get_fasta_string(id, seq):
    """Return one FASTA record: a '>seq<id>' header line followed by *seq*."""
    # NOTE(review): parameter name `id` shadows the builtin; kept so the
    # existing call signature is untouched.
    return '\n'.join(('>seq' + id, seq))
# Build one FASTA record per domain row — header is 'index|uniprot|pfam',
# body is the domain sequence — and write them all to a single file.
fasta_lines = [
    get_fasta_string('|'.join(str(field) for field in row[:3]), row[3])
    for row in seqs_to_align
]
with open('data/domain_seqs.fasta', 'w') as fasta_file:
    fasta_file.write('\n'.join(fasta_lines))
from Bio import AlignIO
import numpy as np
from sklearn.feature_selection import VarianceThreshold
#mpalign = AlignIO.read('data/clustalo-E20181114-111527-0141-93210866-p2m.clustal_num','clustal')
# Clustal Omega multiple alignment of the selected CL0016 domain sequences.
mpalign = AlignIO.read('data/clustalo-I20181114-160345-0776-47823852-p1m.clustal_num','clustal')
def get_features_from_alignment(mpalign):
    """One-hot encode a multiple sequence alignment into a binary DataFrame.

    Rows are the alignment's sequence ids; columns are per-position residue
    indicator features. Columns encoding the gap character '-' are dropped.
    """
    from sklearn.preprocessing import OneHotEncoder
    residue_matrix = np.array([list(str(rec.seq)) for rec in mpalign])
    sequence_ids = [rec.id for rec in mpalign]
    encoder = OneHotEncoder()
    encoded = encoder.fit_transform(residue_matrix)
    # Keep only feature columns whose name does not encode the gap symbol.
    # NOTE(review): get_feature_names() is deprecated in newer scikit-learn
    # (get_feature_names_out) — kept to match the installed version here.
    kept = [(idx, label)
            for idx, label in enumerate(encoder.get_feature_names())
            if '_-' not in label]
    kept_indexes, kept_labels = list(zip(*kept))
    dense = encoded[:, kept_indexes].toarray()
    features = pd.DataFrame(dense).astype(int)
    features.index = sequence_ids
    features.columns = kept_labels
    return features
seq_features_df = get_features_from_alignment(mpalign)

# Sweep Bernoulli-variance thresholds p*(1-p) for p = 0.00 .. 0.19 and record,
# for each p, how many feature columns survive and which ones they are.
strf = []
for step in range(20):
    p = 0.01 * step
    selector = VarianceThreshold(threshold=p * (1 - p))
    selector.fit(seq_features_df)
    surviving = seq_features_df.columns[selector.get_support()]
    strf.append(str(p) + "|" + str(len(surviving)) + "|" + " ".join(surviving))

with open('test_vt', 'w') as report_file:
    report_file.write('\n'.join(strf))

seq_features_df.to_csv('generated/multiple_align_features.csv')
# NOTE(review): no-effect expression — REPL leftover?
seq_features_df.x0_C.value_counts()
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment