GitLab CI for Docker builds is enabled! You can enable it using .gitlab-ci.yml in your project. Check the file template at https://gitlab.bio.di.uminho.pt/snippets/5

Commit 2e4211b8 authored by Vítor Vieira's avatar Vítor Vieira
Browse files

[FIX] Uniprot/PFAM retrieval and parsing

parent 00786a8f
......@@ -21,17 +21,17 @@ def read_csv_by_chunks(file_path, sep, chunksize, protein_list, verbose=False, *
print("Chunk #",i,'has shape',chunk_protein.shape,"--",'Dataframe shape:',df.shape)
return df
df = read_csv_by_chunks(file_path, sep, 1e7, protein_list, True, usecols=rcols)
df = read_csv_by_chunks(file_path, sep, 1e7, protein_set, True, usecols=rcols)
df.to_csv('generated/pfam_training_df.csv')
#df = pd.read_csv('generated/pfam_training_df.csv')
def uniprot_accession_url(accessions):
    """Build an EBI Proteins API query URL for the given UniProt accessions.

    Parameters
    ----------
    accessions : iterable of str
        UniProt accession identifiers; joined with commas into one request.

    Returns
    -------
    str
        Full request URL (size=-1 asks the API for all matching entries).
    """
    # PEP 8: a named callable should be a def, not a lambda bound to a name.
    return 'https://www.ebi.ac.uk/proteins/api/proteins?offset=0&size=-1&accession=' + ','.join(accessions)
from itertools import chain
import requests
def get_json_from_uniprot(acc_list):
rq = requests.get(uniprot_accession_url(acc_list), headers={'Accept': 'application/json'})
......@@ -62,7 +62,6 @@ def get_dict_from_json(obj):
'dbReferences': [item for item in obj['dbReferences'] if item['type'] in db_keys] if 'dbReferences' in obj.keys() else None
}
uniprot_df_seqs.index.unique()
uniprot_seqs = {item['accession']:get_dict_from_json(item) for item in uniprot_json}
uniprot_df_seqs = pd.DataFrame.from_dict(uniprot_seqs).T
......@@ -89,6 +88,7 @@ for acc, pfam, start, end in list(zip(df.uniprot_acc, df.pfamA_acc, df.seq_start
domain_df = pd.DataFrame(domain_sequences, columns=['uniprot_acc','pfamA_acc','domain_sequence'])
domain_df.to_csv('generated/domain_sequences.csv')
#
# test_template = pd.read_csv('dream/round_1_template.csv')
# uniprot_df_seqs.loc[test_template.UniProt_Id.unique(),:].features.apply()
......
......@@ -35,23 +35,27 @@ for descriptor, df in zip(descriptors, descriptor_dfs):
final_descriptor_df = pd.concat([df.set_index('smiles') for df in tdfs], axis=1, join='inner').reset_index()
training_df = pd.merge(finaldf, final_descriptor_df, on='smiles', how='right').dropna(how='all', axis=1)[[col for col in training_df.columns if col not in extra_cols]]
training_df = pd.merge(finaldf, final_descriptor_df, on='smiles', how='right').dropna(how='all', axis=1)
training_df = training_df[[col for col in training_df.columns if col not in extra_cols]]
# Keep only rows with nanomolar units -- presumably standardized ChEMBL
# activity values; other units would not be comparable (TODO confirm).
good_units = training_df.standard_units == 'NM'
# Drop rows with no measured activity value.
has_values = ~training_df.standard_value.isnull()
# Filter to usable rows: NM units, a value present, and a known target.
training_df = training_df[good_units & has_values & ~training_df.target_id.isna()]
# Rows whose target_id holds a comma-separated list (multiple targets) --
# these need to be expanded into one row per target before use.
baddf = training_df[training_df.target_id.apply(lambda x: ',' in x)]
# Rows with exactly one target_id, usable as-is.
subdf_correct = training_df[~training_df.target_id.apply(lambda x: ',' in x)]
def replace_value_in_row(row, id, sep=','):
    """Expand a row whose ``row[id]`` holds a separator-joined list of values.

    Returns one copy of ``row`` per item, each copy carrying a single
    (stripped) item in its ``id`` field. The input row is not modified.

    Parameters
    ----------
    row : mapping-like (e.g. dict or pandas Series)
        Must support item access/assignment and ``.copy()``.
    id : str
        Key/column whose string value is split.  (Name kept for backward
        compatibility; note it shadows the ``id`` builtin.)
    sep : str, optional
        Separator to split on (default ``','``).

    Returns
    -------
    list
        Independent row copies, one per split item.
    """
    items = [s.strip() for s in row[id].split(sep)]
    # BUG FIX: the original built `[row]*len(items)` -- len(items) references
    # to the SAME object -- then mutated it in a loop, so every returned row
    # ended up with the last item. Copy the row once per item instead.
    expanded = []
    for item in items:
        new_row = row.copy()
        new_row[id] = item
        expanded.append(new_row)
    return expanded
training_df = training_df[~training_df.target_id.apply(lambda x: ',' in x)]
# TODO: fix rows with multiple values on target_id
# baddf = training_df[training_df.target_id.apply(lambda x: ',' in x)]
#
# subdf_correct = training_df[]
#
#
# def replace_value_in_row(row, id, sep=','):
# items = [s.strip() for s in row[id].split(sep)]
# df_to_add = [row]*len(items)
# for item, row in zip(items,df_to_add):
# row[id] = item
# return df_to_add
# new_rows = list(chain(*baddf.apply(lambda x: replace_value_in_row(row=x, id='target_id'), axis=1)))
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment