GitLab CI for Docker build enabled! You can enable it using .gitlab-ci.yml in your project. Check file template at

Commit e0c1948a authored by Vítor Vieira
Browse files

[ADD] Uniprot/PFAM retrieval and parsing

parent 68d5ee51
Pipeline #9 failed with stages
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
import pandas as pd
import json
# Path and field delimiter of the Pfam-A regions table that is read further down.
file_path = '/home/skapur/Pfam-A.regions.uniprot.tsv.gz'
sep = '\t'
# Columns requested from the regions table.
rcols = ['uniprot_acc','seq_version','pfamA_acc','seq_start','seq_end']
training_df = pd.read_csv('generated/training_df.csv')
# The unique non-null target_id values are joined with commas and split again,
# so a cell holding several comma-separated accessions contributes each of
# them individually; surrounding whitespace is stripped.
protein_set = {
    accession.strip()
    for accession in ','.join(training_df.target_id.dropna().unique()).split(',')
}
def read_csv_by_chunks(file_path, sep, chunksize, protein_list, verbose=False, **kwargs):
    """Read a delimited file chunk by chunk, keeping only selected proteins.

    Each chunk is filtered to the rows whose ``uniprot_acc`` value is in
    ``protein_list``; the surviving rows are concatenated once at the end, so
    every matching row appears exactly once and keeps its original index.

    Args:
        file_path: Path (or anything ``pd.read_csv`` accepts) of the file.
        sep: Field delimiter passed to ``pd.read_csv``.
        chunksize: Number of rows per chunk. Integral floats such as ``1e7``
            are accepted and converted to ``int``.
        protein_list: Collection of accessions to keep.
        verbose: When true, print per-chunk progress.
        **kwargs: Forwarded unchanged to ``pd.read_csv`` (e.g. ``usecols``).

    Returns:
        A DataFrame with the matching rows, an empty DataFrame carrying the
        file's columns when nothing matches, or ``None`` when the reader
        yields no chunk at all.
    """
    kept_chunks = []
    empty_template = None  # zero-row frame with the right columns/dtypes
    total_rows = 0
    reader = pd.read_csv(file_path, sep=sep, chunksize=int(chunksize), **kwargs)
    for i, chunk in enumerate(reader):
        chunk_protein = chunk.loc[chunk.uniprot_acc.isin(protein_list), :]
        if empty_template is None:
            empty_template = chunk_protein.iloc[0:0]
        # Only non-empty pieces are collected; concatenating once at the end
        # avoids re-copying the accumulated frame on every chunk.
        if chunk_protein.shape[0] > 0:
            kept_chunks.append(chunk_protein)
            total_rows += chunk_protein.shape[0]
        if verbose:
            print("Chunk #", i, 'has shape', chunk_protein.shape, "--",
                  'Dataframe shape:', (total_rows, chunk_protein.shape[1]))
    if empty_template is None:
        return None
    if not kept_chunks:
        return empty_template
    return pd.concat(kept_chunks)
# Keep only the regions rows that belong to the training proteins.
# ``protein_set`` is passed because ``protein_list`` is only created further
# down in the script and would raise NameError at this point.
df = read_csv_by_chunks(file_path, sep, 10 ** 7, protein_set, True, usecols=rcols)
#df = pd.read_csv('generated/pfam_training_df.csv')


def uniprot_accession_url(accessions):
    """Return the request URL for a batch of accessions, comma-joined."""
    # NOTE(review): the base URL is an empty string, so the result is just the
    # comma-joined accession list and cannot be fetched as-is. The prefix
    # (endpoint + query parameter) appears to be missing -- restore it here.
    return '' + ','.join(accessions)
from itertools import chain
def get_json_from_uniprot(acc_list):
    """Fetch the JSON document for a batch of accessions.

    The URL is built with ``uniprot_accession_url`` and requested with an
    ``Accept: application/json`` header; the response body is parsed with
    ``json.loads`` and returned as-is.

    Args:
        acc_list: Sequence of accession strings for one request.

    Returns:
        The decoded JSON payload, or an empty list when ``acc_list`` is empty
        (no request is made). The caller flattens the results of several
        calls, so the payload is presumably a list of entries -- confirm
        against the service.

    Raises:
        urllib.error.HTTPError: On a non-success HTTP status.
        urllib.error.URLError: On connection problems.
    """
    # Imported locally: the standard library replaces the ``requests`` name
    # that this function used without it ever being imported in the file.
    import urllib.request

    if len(acc_list) == 0:
        return []
    request = urllib.request.Request(
        uniprot_accession_url(acc_list),
        headers={'Accept': 'application/json'},
    )
    # The timeout bounds each blocking socket operation so a stalled
    # connection cannot hang the script forever.
    with urllib.request.urlopen(request, timeout=60) as response:
        return json.loads(response.read())
protein_list = list(protein_set)
# Query the accessions in slices of 100 and flatten the per-request results
# into a single list of entries.
full_entries = list(chain.from_iterable(
    [get_json_from_uniprot(protein_list[offset:offset + 100])
     for offset in range(0, len(protein_list), 100)]
))
# Cache the downloaded entries on disk ...
with open('generated/uniprot_df.json', 'w') as f:
    json.dump(full_entries, f)
#uniprot_fields = ['accession','sequence.sequence']
#uniprot_df =
# ... and load them back from that cache file.
with open('generated/uniprot_df.json', 'r') as f:
    uniprot_json = json.load(f)
def get_dict_from_json(obj):
    """Reduce one entry dict to its sequence, features and cross-references.

    Args:
        obj: Entry dict. Must contain ``obj['sequence']['sequence']``; the
            ``'features'`` and ``'dbReferences'`` keys are optional lists of
            dicts that each carry a ``'type'`` key.

    Returns:
        A dict with three keys:
        ``'sequence'`` -- the sequence value taken from the entry;
        ``'features'`` -- the features whose type is in ``feature_keys``, or
        ``None`` when the entry has no ``'features'`` key;
        ``'dbReferences'`` -- the references whose type is one of GO,
        InterPro, Pfam, PROSITE or SUPFAM, or ``None`` when the entry has no
        ``'dbReferences'`` key.
    """
    db_keys = ['GO','InterPro','Pfam','PROSITE','SUPFAM']
    sequence = obj['sequence']['sequence']

    features = None
    if 'features' in obj:
        # NOTE(review): ``feature_keys`` is read from module scope but is not
        # defined in the visible part of this file -- confirm it exists.
        features = [item for item in obj['features'] if item['type'] in feature_keys]

    db_references = None
    if 'dbReferences' in obj:
        db_references = [item for item in obj['dbReferences'] if item['type'] in db_keys]

    return {
        'sequence': sequence,
        'features': features,
        'dbReferences': db_references,
    }
# One row per accession with the columns 'sequence', 'features', 'dbReferences'.
uniprot_seqs = {item['accession']: get_dict_from_json(item) for item in uniprot_json}
uniprot_df_seqs = pd.DataFrame.from_dict(uniprot_seqs).T

# Cross-reference ids per accession. An entry whose dbReferences value is not
# a list (e.g. None for entries without that key) counts as having none.
acc_to_ref_ids = {
    acc: [ref['id'] for ref in refs] if isinstance(refs, list) else []
    for acc, refs in uniprot_df_seqs.dbReferences.items()
}

# De-duplicate while keeping first-seen order, so each identifier gets exactly
# one column; the id -> column map replaces a linear list search per lookup.
unique_ids = list(dict.fromkeys(chain.from_iterable(acc_to_ref_ids.values())))
id_to_column = {ref_id: column for column, ref_id in enumerate(unique_ids)}

# Binary indicator matrix: rows are accessions, columns are identifiers.
acc_to_identifiers = {}
for acc, ref_ids in acc_to_ref_ids.items():
    indicator = [0] * len(unique_ids)
    for ref_id in ref_ids:
        indicator[id_to_column[ref_id]] = 1
    acc_to_identifiers[acc] = indicator
acc_to_identifiers_df = pd.DataFrame(acc_to_identifiers).T
acc_to_identifiers_df.columns = unique_ids

# Cut every region out of its protein sequence. The slice [start-1:end]
# treats seq_start/seq_end as 1-based, inclusive coordinates. Regions whose
# accession has no downloaded sequence are skipped.
sequences = uniprot_df_seqs['sequence'].to_dict()
domain_sequences = [
    (acc, pfam, sequences[acc][start - 1:end])
    for acc, pfam, start, end in zip(df.uniprot_acc, df.pfamA_acc, df.seq_start, df.seq_end)
    if acc in sequences
]
domain_df = pd.DataFrame(domain_sequences, columns=['uniprot_acc','pfamA_acc','domain_sequence'])
# test_template = pd.read_csv('dream/round_1_template.csv')
# uniprot_df_seqs.loc[test_template.UniProt_Id.unique(),:].features.apply()
# domain_df_test = domain_df[domain_df.uniprot_acc.isin(test_template.UniProt_Id.unique())]
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment