def generate_discardable_iterator(collection):
    # wraps a collection in a generator so that its items can be consumed once and then discarded
    for i in collection:
        yield i
def read_tab_delimited_count_file(filename):
    # input is a txt file whose first line is a header; each remaining line contains an
    # identifying string (which may contain spaces) and the respective count, separated by a tab
    # output is a dictionary; the keys are the categories and the values are the respective counts
    gst = {}
    file = open(filename, "r")
    data = file.readlines()
    file.close()
    data_iterator = generate_discardable_iterator(data[1:])
    for category_count in data_iterator:
        info = category_count.split("\t")
        gst[info[0].rstrip()] = int(info[1].replace("\n", "").replace(",", "").rstrip())
    return gst
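# Worked example of the per-line parsing above (a made-up line, not a real GEO count): the category
# and the count are separated by a tab, and the thousands separator is stripped before conversion.
_example_fields = "Expression profiling by array\t104,963\n".split("\t")
assert int(_example_fields[1].replace("\n", "").replace(",", "").rstrip()) == 104963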
def group_counts_by_category(dict_counts, hst=False):
    # input is a dictionary generated by read_tab_delimited_count_file()
    # output is another dictionary with entries grouped by category
    # example: the counts of "Expression profiling by array" and "Expression profiling by SAGE"
    # are grouped under the same category, "Expression profiling"
    # if the hst parameter is True, high throughput sequencing counts are tallied separately per category
    # each value of the output dictionary is a 2-element list: [counts without hts, counts with hts]
    categories = {"Expression profiling": [0, 0], "Genome variation profiling": [0, 0],
                  "Genome binding/occupancy profiling": [0, 0], "Methylation profiling": [0, 0],
                  "Protein profiling": [0, 0], "Non-coding RNA profiling": [0, 0], "Other": [0, 0]}
    for series_type in dict_counts.keys():
        count = int(dict_counts[series_type])
        found = False
        for k in categories.keys():
            if k in series_type:
                found = True
                if not hst:
                    categories[k][0] += count
                elif "high throughput sequencing" in series_type:
                    categories[k][1] += count
                else:
                    categories[k][0] += count
        if not found:
            categories["Other"][0] += count
    return categories
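# Minimal illustrative check (hypothetical counts, not GEO data): with hst=True, the first slot of each
# 2-element list accumulates counts without high throughput sequencing and the second slot the counts with it.
assert group_counts_by_category(
    {"Expression profiling by array": 5,
     "Expression profiling by high throughput sequencing": 3},
    hst=True)["Expression profiling"] == [5, 3]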
def group_data_by_high_throughput_sequencing(dict_counts):
    # input is a dictionary generated by read_tab_delimited_count_file()
    # output is a dictionary with two keys, "High throughput sequencing data" and "Array data";
    # the associated values are the total series counts of each group
    hst_counts = {"High throughput sequencing data": 0, "Array data": 0}
    exceptions = ["Other", "Third-party reanalysis"]
    category_to_group_by = "high throughput sequencing"
    dict_keys = generate_discardable_iterator(dict_counts.keys())
    for k in dict_keys:
        count = int(dict_counts[k])
        if category_to_group_by in k and k not in exceptions:
            hst_counts["High throughput sequencing data"] += count
        elif k not in exceptions:
            hst_counts["Array data"] += count
    return hst_counts
def get_list_of_bacteria_genus(filename):
    # input is a file with names of bacteria, one name per line
    # output is a list of bacteria genus names without duplicates
    file = open(filename, "r")
    genus = file.readlines()
    file.close()
    gen = generate_discardable_iterator(range(len(genus)))
    for g in gen:
        genus[g] = genus[g].rstrip()
    return list(set(genus))
def get_dict_of_bacteria(gso_dict, list_of_bacteria):
    # input is a dictionary generated by read_tab_delimited_count_file() with organism info
    # and a list of known bacterial genus names
    # output is a tuple: a dictionary restricted to bacterial organisms and
    # the total number of series belonging to non-bacterial organisms
    bacteria_dict = {}
    non_bacteria_number_of_series = 0
    gso_dict_keys = generate_discardable_iterator(gso_dict.keys())
    for organism in gso_dict_keys:
        if organism.split(" ")[0] in list_of_bacteria:
            bacteria_dict[organism] = gso_dict[organism]
        else:
            non_bacteria_number_of_series += gso_dict[organism]
    return bacteria_dict, non_bacteria_number_of_series
def get_most_represented_bacteria(bacteria_dict):
    # input is a dictionary of bacteria series counts generated by get_dict_of_bacteria()
    # output is a list of tuples sorted so that the most represented bacteria come first
    return sorted(list(bacteria_dict.items()), key=lambda tup: tup[1], reverse=True)
def merge_bacteria_strains(bacteria_dict):
    # input is a dictionary of bacteria series counts generated by get_dict_of_bacteria()
    # different strains of the same species are merged (and their counts are summed)
    # output is a dictionary structured like the input
    merged_bacteria_dict = {}
    items = bacteria_dict.items()
    items_iter = generate_discardable_iterator(items)
    for i in items_iter:
        species_without_strain = " ".join(i[0].split(" ")[:2])
        if species_without_strain not in merged_bacteria_dict:
            merged_bacteria_dict[species_without_strain] = i[1]
        else:
            merged_bacteria_dict[species_without_strain] += i[1]
    return merged_bacteria_dict
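# Minimal illustrative check (hypothetical counts, not taken from GEO): strain labels collapse onto
# the first two words of the organism name, i.e. the species, and their counts are summed.
assert merge_bacteria_strains({"Escherichia coli K-12": 10, "Escherichia coli O157:H7": 4}) == {"Escherichia coli": 14}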
def get_geo_data_over_years(filename):
    # input is the file with GEO data over the years
    # output is a sorted list of tuples with the formatted years and the associated dataset count
    data_years = []
    file = open(filename, "r")
    data = file.readlines()
    file.close()
    data_iterator = generate_discardable_iterator(data[1:])
    quarters = {"1": 0.2, "2": 0.4, "3": 0.6, "4": 0.8}
    for stats in data_iterator:
        info = stats.split("\t")
        time = int(info[0].rstrip()) + quarters[info[1].rstrip()]
        data_years.append((time, int(info[2].replace(",", "").rstrip())))
    return sorted(data_years, key=lambda tup: tup[0])
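# Worked example of the quarter encoding above (made-up numbers): a line such as "2015\t3\t1,234\n"
# becomes the tuple (2015.6, 1234), i.e. each quarter is mapped onto a fraction of the year so that
# the points spread evenly along the time axis.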
def obtain_genus_from_species(species_dict):
    # input is a dictionary of bacteria series counts generated by merge_bacteria_strains()
    # output is a new dictionary with counts aggregated by genus instead
    new_genus_dict = {}
    species = list(species_dict.keys())
    species_iter = generate_discardable_iterator(species)
    for sp in species_iter:
        genus = sp.split(" ")[0]
        if genus in new_genus_dict:
            new_genus_dict[genus] += species_dict[sp]
        else:
            new_genus_dict[genus] = species_dict[sp]
    return new_genus_dict
def read_taxonomy(filename):
    # taxonomy file downloaded from https://www.itis.gov/hierarchy.html
    # input is a file with taxonomy information downloaded from the previous url
    # output is a list; each element is a stripped taxonomic classification line
    # (the first 4 lines of the file, which form a header, are discarded)
    f = open(filename, "r")
    lines = f.readlines()
    f.close()
    lines_iter = generate_discardable_iterator(lines)
    taxa = []
    for line in lines_iter:
        taxa.append(line.strip())
    taxa = taxa[4:]
    return taxa
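# For reference, a sketch of the assumed layout of the downloaded hierarchy file (the exact header
# wording is not relied upon; only the "Rank: Name" pattern of the classification lines, which
# get_taxa() below splits on ":", matters):
#   <4 header lines, discarded above>
#   Phylum: Acidobacteria
#   Family: Acidobacteriaceae
#   Genus: Acidobacterium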
def get_taxa(taxa_file, taxa_to_count="Family", sub_taxa_to_count="Genus"):
    # input is the taxa list generated by read_taxonomy()
    # taxa_to_count is the taxonomic rank by which all sub taxa are organized
    # sub_taxa_to_count is the sub taxon rank to be grouped under each taxon
    # output is a dictionary with taxonomic groups
    # example:
    # if the input is taxa_file, "Family", "Genus"
    # the output is:
    # {"Acidobacteria": ["Bryobacter", "Acidicapsa", "Acidobacterium", ...], "Family 2": ["Genus X", "Genus Y"]}
    collections = []
    taxa = {}
    size = len(taxa_file)
    for t in range(len(taxa_file)):
        if taxa_to_count in taxa_file[t]:
            group = [taxa_file[t]]
            for i in range(t + 1, len(taxa_file)):
                if taxa_to_count in taxa_file[i] or i + 1 == size:
                    if i + 1 == size:
                        group.append(taxa_file[i])
                    collections.append(group)
                    break
                else:
                    group.append(taxa_file[i])
    collections_iter = generate_discardable_iterator(collections)
    for c in collections_iter:
        sub_taxa = []
        for tax in c:
            if sub_taxa_to_count + ": " in tax:
                sub_taxa.append(tax.split(":")[1].strip())
        taxa[c[0].split(":")[1].strip()] = sub_taxa
    return taxa
def get_counts_by_taxa(bacteria_counts, taxonomy):
    # input is the dictionary with bacterial counts and the taxonomy dictionary
    # output is a dictionary with the number of series for each selected
    # bacterial taxon (such as phylum or family)
    res = {}
    for tup in bacteria_counts.items():
        for tax_tup in taxonomy.items():
            taxa = tup[0]
            taxa_count = tup[1]
            superior_taxa = tax_tup[0]
            list_of_sub_taxas = tax_tup[1]
            if taxa in list_of_sub_taxas:
                if superior_taxa in res:
                    res[superior_taxa] += taxa_count
                else:
                    res[superior_taxa] = taxa_count
    return res
def find_most_representable_species_of_each_family(family_to_genus_dict, species_dict):
    # input is the family-to-genus dictionary generated by get_taxa() and the
    # species counts dictionary generated by merge_bacteria_strains()
    # output is, for each family of bacteria, the species with the most series
    # and its respective number of series
    res = {}
    for species_count_tup in species_dict.items():
        species_name = species_count_tup[0]
        genus = species_count_tup[0].split(" ")[0]
        species_count = species_count_tup[1]
        for fam_to_gen_tup in family_to_genus_dict.items():
            if genus in fam_to_gen_tup[1]:
                fam = fam_to_gen_tup[0]
                if fam not in res:
                    res[fam] = (species_name, species_count)
                    break
                else:
                    if species_count > res[fam][1]:
                        res[fam] = (species_name, species_count)
                    break
    return res
def read_ArrayExpress_file(AE_filename):
    # parses the result of an ArrayExpress query
    # returns a dictionary with data regarding bacterial species
    # and the number of series with data for each species
    file = open(AE_filename, "r")
    AE_bacteria = []
    for line in file.readlines():
        AE_bacteria.append(line.split("\t")[3])  # the 4th tab-separated column holds the organism
    file.close()
    AE_bacteria = AE_bacteria[1:]  # drop the header line
    res = {}
    for bacteria in AE_bacteria:
        if bacteria not in res:
            res[bacteria] = AE_bacteria.count(bacteria)
    return res
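# Design note: the .count() call above rescans AE_bacteria once per distinct species. A single-pass
# alternative (a sketch only, not used in this analysis) would be collections.Counter:
#   from collections import Counter
#   res = dict(Counter(AE_bacteria))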
##### GEO
### Gene expression studies
# Types of expression studies on GEO
geo_series_type = read_tab_delimited_count_file("geo_series_type.txt")
# Expression studies grouped by categories
gst_categories = group_counts_by_category(geo_series_type)
# Expression studies grouped by categories, split by use of high throughput sequencing
gst_categories_with_hst = group_counts_by_category(geo_series_type, hst=True)
geo_hst = []
geo_microarray = []
for i in gst_categories_with_hst.values():
    geo_microarray.append(i[0])
    geo_hst.append(i[1])
# Usage of high throughput sequencing for expression studies
gst_hst = group_data_by_high_throughput_sequencing(geo_series_type)
### Organism studies
# Number of series with data for each species on GEO
geo_series_organisms = read_tab_delimited_count_file("geo_series_organisms.txt")
# A list of known bacterial genus names retrieved elsewhere than GEO.
# Its purpose is to find which organisms on GEO are bacterial
list_bacteria = get_list_of_bacteria_genus("taxonomy.txt")
# Intermediate processing
intermediate_data = get_dict_of_bacteria(geo_series_organisms, list_bacteria)
# Number of series with data for each species (including different strains) of bacteria
bacteria_dict = intermediate_data[0]
# Number of series with data for organisms that are not bacterial
number_of_non_bacteria_series = intermediate_data[1]
# Number of series with data for bacterial organisms
number_of_bacteria_series = sum(bacteria_dict.values())
# Proportion of series with bacterial data relative to non-bacterial data (as a percentage)
bacteria_proportion = number_of_bacteria_series / number_of_non_bacteria_series * 100
# Bacteria species (including different strains) sorted by their number of series
most_represented_bacteria = get_most_represented_bacteria(bacteria_dict)
# Number of series with data for each species of bacteria
# Different strains are merged into the same species
merged_bacteria = merge_bacteria_strains(bacteria_dict)
# Bacteria species sorted by their number of series
# Different strains are merged into the same species
most_represented_merged_bacteria = get_most_represented_bacteria(merged_bacteria)
# Top 10 most represented bacteria species on GEO
top10_bacteria = most_represented_merged_bacteria[:10]
# Evolution of the number of series on GEO from 2012 to 2019
geo_data = get_geo_data_over_years("geo_data_over_years.txt")
years = []
number_of_series = []
for i in geo_data:
    years.append(i[0])
    number_of_series.append(i[1])
# Number of bacterial organisms (including different strains) for which data is available on GEO
number_of_analysed_bacterial_genomes = len(most_represented_bacteria)
### Taxonomy study
# Taxonomy database retrieved from a text file
taxonomy = read_taxonomy("Bacteria_Genus.txt")
# Bacteria available in the taxonomy database grouped by Family and by Phylum
bacteria_family = get_taxa(taxonomy, "Family")
bacteria_phylum = get_taxa(taxonomy, "Phylum")
# Number of series with data for each genus of bacteria
merged_bacteria_by_genus = obtain_genus_from_species(merged_bacteria)
# Number of series with data for each family or phylum of bacteria
taxa_counts_family = get_counts_by_taxa(merged_bacteria_by_genus, bacteria_family)
taxa_counts_phylum = get_counts_by_taxa(merged_bacteria_by_genus, bacteria_phylum)
# Number of different families and phyla available on GEO
#print("GEO has data for", len(taxa_counts_family.keys()), "different families")
#print("GEO has data for", len(taxa_counts_phylum.keys()), "different phyla")
# List of phyla available on GEO (for which there are series with data)
list_of_phyla_in_geo = list(taxa_counts_phylum.keys())
# Most represented species of each family available on GEO and its number of series
most_representable_species_of_each_family = find_most_representable_species_of_each_family(bacteria_family, merged_bacteria)
data = list(most_representable_species_of_each_family.items())
final_data = sorted(data, key=lambda tup: tup[1][1], reverse=True)[:21]
# Phylogenetic tree: contextualization
# first we found the most representative species of each family
# then we sorted that list and kept the top 20 families with the most represented species
# our goal at this stage is to design a phylogenetic tree with these families
# the most appropriate data for building phylogenetic trees is the 16S ribosomal RNA
# using NCBI, we manually collected 16S ribosomal RNA FASTA sequences for the top 20 species filtered beforehand
# these sequences were stored in a file
# then we used EBI's Clustal Omega web tool to build a phylogenetic tree from this data, using default parameters
# the tree was further refined with relevant information such as color coding by phylum, association of each family
# with its species, and the respective number of series on GEO
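# The 16S sequences above were collected manually from NCBI; the helper below is only a sketch of
# how that retrieval could be scripted with Biopython's Entrez module. It is hypothetical and never
# called in this analysis: the query term, the e-mail placeholder and the "first hit only" choice
# are assumptions, and Biopython must be installed for it to work.
def fetch_16s_fasta(species_name, email="your.email@example.com"):
    # returns the FASTA text of the first 16S ribosomal RNA nucleotide record found for the species
    from Bio import Entrez
    Entrez.email = email  # NCBI requires a contact e-mail for Entrez queries
    term = '"' + species_name + '"[Organism] AND 16S ribosomal RNA[Title]'
    handle = Entrez.esearch(db="nucleotide", term=term, retmax=1)
    record = Entrez.read(handle)
    handle.close()
    if not record["IdList"]:
        return None
    handle = Entrez.efetch(db="nucleotide", id=record["IdList"][0], rettype="fasta", retmode="text")
    fasta = handle.read()
    handle.close()
    return fasta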
##### ArrayExpress
### Gene expression studies
# Expression studies on ArrayExpress grouped by categories
# Data available only on ArrayExpress (not on other EBI databases)
# Expression studies grouped by categories (number of experiments) on ArrayExpress
AE_gene_expression_studies_NGS = {"RNA-Seq": 2344, "ChIP-seq": 432, "GRO-seq": 7, "NET-seq": 1, "Ribo-seq": 2,
                                  "Hi-C": 9, "BS-seq": 2, "ATAC-Seq": 20, "RIP-Seq": 26}
AE_gene_expression_studies_Microarray = {"Transcription profiling": 8150,
                                         "Comparative genomic hybridization by array": 417,
                                         "Methylation profiling": 150,
                                         "Proteomic profiling by array": 22,
                                         "Genotyping by array": 151, "Other": 644}
AE_ngs_microarray = {"RNA-Seq": 0, "ChIP-seq": 0, "Transcription profiling": 0,
                     "Comparative genomic hybridization by array": 0, "Other": 0}
other_ngs = 0
other_microarray = 644
for k in AE_gene_expression_studies_NGS:
    if k in AE_ngs_microarray:
        AE_ngs_microarray[k] += AE_gene_expression_studies_NGS[k]
    else:
        other_ngs += AE_gene_expression_studies_NGS[k]
for k2 in AE_gene_expression_studies_Microarray:
    if k2 in AE_ngs_microarray:
        AE_ngs_microarray[k2] += AE_gene_expression_studies_Microarray[k2]
    else:
        other_microarray += AE_gene_expression_studies_Microarray[k2]
# Usage of high throughput sequencing for expression studies on ArrayExpress
AE_number_of_NGS_GE_studies = sum(AE_gene_expression_studies_NGS.values())
AE_number_of_Microarray_studies = sum(AE_gene_expression_studies_Microarray.values())
# Number of series with data for each species (including different strains) of bacteria on ArrayExpress
AE_supposed_bacterial_species_count = read_ArrayExpress_file("ArrayExpress-Experiments-Bacteria.txt")
AE_bacterial_species_count = get_dict_of_bacteria(AE_supposed_bacterial_species_count, list_bacteria)[0]
# Number of series with data for each species of bacteria on ArrayExpress
# Different strains are merged into the same species
AE_merged_bacteria = merge_bacteria_strains(AE_bacterial_species_count)
# Number of series with data for bacterial organisms on ArrayExpress
AE_bacterial_data = sum(AE_merged_bacteria.values())
# Number of series with data for organisms that are not bacterial on ArrayExpress
# 12375 is the total number of series exclusively on ArrayExpress (filtered by "AE only" in the platform's search engine)
AE_non_bacterial_data = 12375 - AE_bacterial_data
# Proportion of series with bacterial data relative to non-bacterial data on ArrayExpress (as a percentage)
AE_bacterial_proportion = round(AE_bacterial_data / AE_non_bacterial_data * 100)
# Bacteria species (strains already merged) sorted by their number of series on ArrayExpress
most_represented_AE_bacteria = get_most_represented_bacteria(AE_merged_bacteria)
# Top 10 most represented bacteria species on ArrayExpress
top_10_AE_bacteria = most_represented_AE_bacteria[:10]
# Number of bacteria species for which data is available on ArrayExpress
AE_number_of_analysed_bacterial_genomes = len(most_represented_AE_bacteria)
### PROCESSING IS COMPLETE
### DATA IS COLLECTED AND READY TO BE VISUALIZED
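# A minimal visualization sketch (not part of the original processing; assumes matplotlib is
# installed): plotting the evolution of the number of GEO series over time from the "years" and
# "number_of_series" lists built above. Left commented out so that the script stays non-interactive.
#import matplotlib.pyplot as plt
#plt.plot(years, number_of_series)
#plt.xlabel("Year (quarters shown as fractions)")
#plt.ylabel("Number of series on GEO")
#plt.show()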