Commit 45323fdd, authored Feb 05, 2019 by Diogo Batista Lima: "Upload New File" (parent 86ac1d93)

PACBB_Code/geo_arrayexpress.py (new file)

def generate_discardable_iterator(collection):
    # wraps any collection in a generator so it can be consumed once and then discarded
    for i in collection:
        yield i

def read_tab_delimited_count_file(filename):
    # input is a txt file with an identifying string separated by spaces
    # and the respective count separated by a tab
    # output is a dictionary; the keys match each category and the values are the respective count
    gst = {}
    file = open(filename, "r")
    data = file.readlines()
    file.close()
    data_iterator = generate_discardable_iterator(data[1:])
    for category_count in data_iterator:
        info = category_count.split("\t")
        gst[info[0].rstrip()] = int(info[1].replace("\n", "").replace(",", "").rstrip())
    return gst
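
# Illustrative sketch of the expected input (hypothetical line, not taken from the
# actual GEO export): after skipping the header line, a line such as
#     Expression profiling by array<TAB>1,234
# would be parsed into the entry {"Expression profiling by array": 1234}.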

def group_counts_by_category(dict_counts, hst=False):
    # input is a dictionary generated by read_tab_delimited_count_file()
    # output is another dictionary with entries grouped by category
    # example: the counts of "Expression profiling by array" and "Expression profiling by SAGE"
    # should be grouped in the same category, "Expression profiling"
    # if the hst parameter is True, the output also counts high throughput sequencing data by category
    # The output dictionary's values are 2-value lists with the overall count and the count obtained with hts
    categories = {"Expression profiling": [0, 0],
                  "Genome variation profiling": [0, 0],
                  "Genome binding/occupancy profiling": [0, 0],
                  "Methylation profiling": [0, 0],
                  "Protein profiling": [0, 0],
                  "Non-coding RNA profiling": [0, 0],
                  "Other": [0, 0]}
    seen = []
    # each series type is added to every category whose name it contains;
    # types that match no category are accumulated once under "Other"
    for type in dict_counts.keys():
        count = int(dict_counts[type])
        found = False
        for k in categories.keys():
            if k in type:
                found = True
                if hst and "high throughput sequencing" in type:
                    categories[k][1] += count
                else:
                    categories[k][0] += count
        if not found and type not in seen:
            categories["Other"][0] += count
            seen.append(type)
    return categories
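
# Illustrative usage sketch (hypothetical counts, not taken from geo_series_type.txt):
#   group_counts_by_category({"Expression profiling by array": 100,
#                             "Expression profiling by high throughput sequencing": 40,
#                             "Third-party reanalysis": 5}, hst=True)
# would yield {"Expression profiling": [100, 40], ..., "Other": [5, 0]},
# with the remaining categories left at [0, 0].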

def group_data_by_high_throughput_sequencing(dict_counts):
    # input is a dictionary generated by read_tab_delimited_count_file()
    # output is a dictionary with two keys; high throughput sequencing
    # and non-hts, the associated values are counts of each category
    hst_counts = {"High throughput sequencing data": 0, "Array data": 0}
    exceptions = ["Other", "Third-party reanalysis"]
    category_to_group_by = "high throughput sequencing"
    dict_keys = generate_discardable_iterator(dict_counts.keys())
    for k in dict_keys:
        count = int(dict_counts[k])
        if category_to_group_by in k and k not in exceptions:
            hst_counts["High throughput sequencing data"] += count
        elif k not in exceptions:
            hst_counts["Array data"] += count
    return hst_counts
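
# Illustrative usage sketch (hypothetical counts): for the input
#   {"Expression profiling by array": 100,
#    "Expression profiling by high throughput sequencing": 40, "Other": 5}
# the result would be {"High throughput sequencing data": 40, "Array data": 100};
# keys listed in `exceptions` are ignored.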

def get_list_of_bacteria_genus(filename):
    # input is a filename with names of bacteria, 1 name per line
    # output is a list of bacteria genus names without duplicates
    file = open(filename, "r")
    genus = file.readlines()
    file.close()
    gen = generate_discardable_iterator(range(len(genus)))
    for g in gen:
        genus[g] = genus[g].rstrip()
    return list(set(genus))
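
# Illustrative sketch (hypothetical file contents): a file with the lines
#     Escherichia
#     Salmonella
#     Escherichia
# would yield a duplicate-free list such as ["Escherichia", "Salmonella"]
# (the order is not preserved because of set()).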

def get_dict_of_bacteria(gso_dict, list_of_bacteria):
    # input is a dictionary generated by read_tab_delimited_count_file with organism info
    # output is a dictionary with bacteria-only information
    bacteria_dict = {}
    non_bacteria_number_of_series = 0
    gso_dict_keys = generate_discardable_iterator(gso_dict.keys())
    for organism in gso_dict_keys:
        if organism.split(" ")[0] in list_of_bacteria:
            bacteria_dict[organism] = gso_dict[organism]
        else:
            non_bacteria_number_of_series += gso_dict[organism]
    return bacteria_dict, non_bacteria_number_of_series
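
# Illustrative usage sketch (hypothetical organisms): for
#   gso_dict = {"Escherichia coli K-12": 10, "Homo sapiens": 3}
# and a list_of_bacteria containing "Escherichia", the result would be
#   ({"Escherichia coli K-12": 10}, 3),
# i.e. the bacterial entries plus the number of series from non-bacterial organisms.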

def get_most_represented_bacteria(bacteria_dict):
    # input is a dictionary of bacteria series count generated by get_dict_of_bacteria()
    # output is a sorted list of tuples with the most represented bacteria
    return sorted(list(bacteria_dict.items()), key=lambda tup: tup[1], reverse=True)
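
# Illustrative usage sketch (hypothetical counts):
#   get_most_represented_bacteria({"A": 2, "B": 10}) -> [("B", 10), ("A", 2)]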

def merge_bacteria_strains(bacteria_dict):
    # input is a dictionary of bacteria series count generated by get_dict_of_bacteria()
    # different strains of the same species are merged (as are their counts)
    # output is a dictionary structured similarly to the input
    merged_bacteria_dict = {}
    items = bacteria_dict.items()
    items_iter = generate_discardable_iterator(items)
    for i in items_iter:
        genus_without_strain = " ".join(i[0].split(" ")[:2])
        if genus_without_strain not in merged_bacteria_dict:
            merged_bacteria_dict[genus_without_strain] = i[1]
        else:
            merged_bacteria_dict[genus_without_strain] += i[1]
    return merged_bacteria_dict
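
# Illustrative usage sketch (hypothetical strains): for
#   {"Escherichia coli K-12": 10, "Escherichia coli O157:H7": 4}
# both keys collapse to their first two words, "Escherichia coli",
# so the result would be {"Escherichia coli": 14}.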

def get_geo_data_over_years(filename):
    # input is the file with GEO data over the years
    # output is a sorted list of tuples with the formatted years and associated dataset count
    data_years = []
    file = open(filename, "r")
    data = file.readlines()
    file.close()
    data_iterator = generate_discardable_iterator(data[1:])
    quarters = {"1": 0.2, "2": 0.4, "3": 0.6, "4": 0.8}
    for stats in data_iterator:
        info = stats.split("\t")
        time = int(info[0].rstrip()) + quarters[info[1].rstrip()]
        data_years.append((time, int(info[2].replace(",", "").rstrip())))
    return sorted(data_years, key=lambda tup: tup[0])

def obtain_genus_from_species(species_dict):
    # input is a dictionary of bacteria series count generated by merge_bacteria_strains()
    # output is a new dictionary with genus counts instead
    new_genus_dict = {}
    species = list(species_dict.keys())
    species_iter = generate_discardable_iterator(species)
    for sp in species_iter:
        genus = sp.split(" ")[0]
        if genus in new_genus_dict:
            new_genus_dict[genus] += species_dict[sp]
        else:
            new_genus_dict[genus] = species_dict[sp]
    return new_genus_dict
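
# Illustrative usage sketch (hypothetical species counts):
#   {"Escherichia coli": 14, "Escherichia fergusonii": 1} collapses to {"Escherichia": 15}.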

def read_taxonomy(filename):
    # taxonomy file downloaded from
    # https://www.itis.gov/hierarchy.html
    # input is a file with taxonomy information downloaded from the previous url
    # output is a list
    # each element is a taxonomic classification
    f = open(filename, "r")
    file = f.readlines()
    f.close()
    file_iter = generate_discardable_iterator(file)
    taxa = []
    for line in file_iter:
        taxa.append(line.strip())
    taxa = taxa[4:]
    return taxa

def get_taxa(taxa_file, taxa_to_count="Family", sub_taxa_to_count="Genus"):
    # input is the taxa list generated by read_taxonomy
    # taxa_to_count is the taxonomic rank by which we want to organize all sub-taxa
    # sub_taxa_to_count is the sub-taxon to be collected within each taxon
    # output is a dictionary with taxonomic groups
    # example:
    # if the input is taxa_file, "Family", "Genus"
    # the output is:
    # {"Acidobacteria": ["Bryobacter", "Acidicapsa", "Acidobacterium", ...], "Family 2": ["Genus X", "Genus Y"]}
    collections = []
    taxa = {}
    size = len(taxa_file)
    for t in range(len(taxa_file)):
        if taxa_to_count in taxa_file[t]:
            # collect every line between this taxon and the next one of the same rank
            group = [taxa_file[t]]
            for i in range(t + 1, len(taxa_file)):
                if taxa_to_count in taxa_file[i] or i + 1 == size:
                    if i + 1 == size:
                        group.append(taxa_file[i])
                    collections.append(group)
                    break
                else:
                    group.append(taxa_file[i])
    collections_iter = generate_discardable_iterator(collections)
    for c in collections_iter:
        sub_taxa = []
        for tax in c:
            if sub_taxa_to_count + ": " in tax:
                sub_taxa.append(tax.split(":")[1].strip())
        taxa[c[0].split(":")[1].strip()] = sub_taxa
    return taxa
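
# Illustrative sketch of the assumed ITIS report layout (not verified against the
# downloaded file): lines such as
#     Family: Enterobacteriaceae
#     Genus: Escherichia
#     Genus: Salmonella
# would make get_taxa(lines, "Family", "Genus") return
#   {"Enterobacteriaceae": ["Escherichia", "Salmonella"]}.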

def get_counts_by_taxa(bacteria_counts, taxonomy):
    # input is the dictionary with bacterial counts and
    # the taxonomy dictionary
    # output is a dictionary with the number of series for each
    # selected bacterial taxon (such as phylum or family)
    res = {}
    for tup in bacteria_counts.items():
        for tax_tup in taxonomy.items():
            taxa = tup[0]
            taxa_count = tup[1]
            superior_taxa = tax_tup[0]
            list_of_sub_taxas = tax_tup[1]
            if taxa in list_of_sub_taxas:
                if superior_taxa in res:
                    res[superior_taxa] += taxa_count
                else:
                    res[superior_taxa] = taxa_count
    return res
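
# Illustrative usage sketch (hypothetical inputs): for
#   bacteria_counts = {"Escherichia": 12, "Salmonella": 3}
#   taxonomy = {"Enterobacteriaceae": ["Escherichia", "Salmonella"]}
# the result would be {"Enterobacteriaceae": 15}.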

def find_most_representable_species_of_each_family(family_to_genus_dict, species_dict):
    # input is the family-to-genus dictionary generated by get_taxa() and the species counts from merge_bacteria_strains()
    # output is the species with the most series for each family of bacteria and its respective number of series
    res = {}
    for species_count_tup in species_dict.items():
        specie = species_count_tup[0]
        genus = species_count_tup[0].split(" ")[0]
        species_count = species_count_tup[1]
        for fam_to_gen_tup in family_to_genus_dict.items():
            if genus in fam_to_gen_tup[1]:
                fam = fam_to_gen_tup[0]
                if fam not in res:
                    res[fam] = (specie, species_count)
                else:
                    if species_count > res[fam][1]:
                        res[fam] = (specie, species_count)
                break
    return res
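
# Illustrative usage sketch (hypothetical inputs): for
#   family_to_genus_dict = {"Enterobacteriaceae": ["Escherichia", "Salmonella"]}
#   species_dict = {"Escherichia coli": 14, "Salmonella enterica": 20}
# the result would be {"Enterobacteriaceae": ("Salmonella enterica", 20)}.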

def read_ArrayExpress_file(AE_filename):
    # parses the result of an ArrayExpress query
    # returns a dictionary with data regarding
    # bacterial species and the number of series with data for each species
    file = open(AE_filename, "r")
    AE_bacteria = []
    for line in file.readlines():
        AE_bacteria.append(line.split("\t")[3])
    file.close()
    AE_bacteria = AE_bacteria[1:]
    res = {}
    for bacteria in AE_bacteria:
        if bacteria not in res:
            res[bacteria] = AE_bacteria.count(bacteria)
    return res
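
# Illustrative sketch (column layout inferred from the code, not verified against the
# actual ArrayExpress export): the organism name is taken from the 4th tab-separated
# column of each data line, so if "Escherichia coli" appears there on 7 lines the
# result would contain {"Escherichia coli": 7, ...}.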

##### GEO
### Gene expression studies
# Type of expression studies on GEO
geo_series_type = read_tab_delimited_count_file("geo_series_type.txt")
# Expression studies grouped by categories
gst_categories = group_counts_by_category(geo_series_type)
# Expression studies grouped by categories, divided by use of high throughput sequencing
gst_categories_with_hst = group_counts_by_category(geo_series_type, hst=True)
geo_hst = []
geo_microarray = []
for i in gst_categories_with_hst.values():
    geo_microarray.append(i[0])
    geo_hst.append(i[1])
# Usage of high throughput sequencing for expression studies
gst_hst = group_data_by_high_throughput_sequencing(geo_series_type)

### Organism studies
# Number of series with data for each species on GEO
geo_series_organisms = read_tab_delimited_count_file("geo_series_organisms.txt")
# A list of known bacteria genus names retrieved from a source other than GEO.
# It is used to find which organisms on GEO are bacterial
list_bacteria = get_list_of_bacteria_genus("taxonomy.txt")
# Intermediate processing
intermediate_data = get_dict_of_bacteria(geo_series_organisms, list_bacteria)
# Number of series with data for each species (including different strains) of bacteria
bacteria_dict = intermediate_data[0]
# Number of series with data for organisms that are not bacterial
number_of_non_bacteria_series = intermediate_data[1]
# Number of series with data for bacterial organisms
number_of_bacteria_series = sum(bacteria_dict.values())
# Proportion of the number of series with bacterial data compared to non-bacterial data
bacteria_proportion = number_of_bacteria_series / number_of_non_bacteria_series * 100
# Bacteria species (including different strains) sorted by number of series
most_represented_bacteria = get_most_represented_bacteria(bacteria_dict)
# Number of series with data for each species of bacteria
# Different strains are merged into the same species
merged_bacteria = merge_bacteria_strains(bacteria_dict)
# Bacteria species sorted by number of series
# Different strains are merged into the same species
most_represented_merged_bacteria = get_most_represented_bacteria(merged_bacteria)
# Top 10 most represented bacteria species on GEO
top10_bacteria = most_represented_merged_bacteria[:10]
# Evolution of the number of series on GEO from 2012 to 2019
geo_data = get_geo_data_over_years("geo_data_over_years.txt")
years = []
number_of_series = []
for i in geo_data:
    years.append(i[0])
    number_of_series.append(i[1])
# Number of bacteria species for which data is available at GEO
number_of_analysed_bacterial_genomes = len(most_represented_bacteria)

### Taxonomy study
# Taxonomy database retrieved from a text file
taxonomy = read_taxonomy("Bacteria_Genus.txt")
# Bacteria available in the taxonomy database grouped by Family and by Phylum
bacteria_family = get_taxa(taxonomy, "Family")
bacteria_phylum = get_taxa(taxonomy, "Phylum")
# Number of series with data for each genus of bacteria
merged_bacteria_by_genus = obtain_genus_from_species(merged_bacteria)
# Number of series with data for each family or phylum of bacteria
taxa_counts_family = get_counts_by_taxa(merged_bacteria_by_genus, bacteria_family)
taxa_counts_phylum = get_counts_by_taxa(merged_bacteria_by_genus, bacteria_phylum)
# Number of different families and phyla available on GEO
# print("GEO has data for", len(taxa_counts_family.keys()), "different families")
# print("GEO has data for", len(taxa_counts_phylum.keys()), "different phyla")
# List of phyla available on GEO (for which there are series with data)
list_of_phyla_in_geo = list(taxa_counts_phylum.keys())
# Most representative species of each family available at GEO and its number of series
most_representable_species_of_each_family = find_most_representable_species_of_each_family(bacteria_family, merged_bacteria)
data = list(most_representable_species_of_each_family.items())
final_data = sorted(data, key=lambda tup: tup[1][1], reverse=True)[:21]

# Phylogenetic tree: contextualization
# First we found the most representative species of each family.
# Then we sorted that list and kept the top 20 families with the most represented species.
# Our goal at this stage is to design a phylogenetic tree with these families.
# The most appropriate data for building phylogenetic trees is 16S ribosomal RNA.
# Using NCBI, we manually collected 16S ribosomal RNA FASTA sequences for the top 20 species filtered beforehand.
# These sequences were stored in a file.
# We then used EBI's Clustal Omega web tool to build a phylogenetic tree from this data, using default parameters.
# The tree was further refined with relevant information such as color coding by phylum and the association of each family
# with its species and the respective number of series on GEO.

##### ArrayExpress
### Gene expression studies
# Expression studies on ArrayExpress grouped by categories
# Data available only on ArrayExpress (not on other EBI databases)
# Expression studies grouped by categories (number of experiments) on ArrayExpress
AE_gene_expression_studies_NGS = {"RNA-Seq": 2344, "ChIP-seq": 432, "GRO-seq": 7,
                                  "NET-seq": 1, "Ribo-seq": 2, "Hi-C": 9,
                                  "BS-seq": 2, "ATAC-Seq": 20, "RIP-Seq": 26}
AE_gene_expression_studies_Microarray = {"Transcription profiling": 8150,
                                         "Comparative genomic hybridization by array": 417,
                                         "Methylation profiling": 150,
                                         "Proteomic profiling by array": 22,
                                         "Genotyping by array": 151,
                                         "Other": 644}
AE_ngs_microarray = {"RNA-Seq": 0, "ChIP-seq": 0, "Transcription profiling": 0,
                     "Comparative genomic hybridization by array": 0, "Other": 0}
other_ngs = 0
other_microarray = 644
for k in AE_gene_expression_studies_NGS:
    if k in AE_ngs_microarray:
        AE_ngs_microarray[k] += AE_gene_expression_studies_NGS[k]
    else:
        other_ngs += AE_gene_expression_studies_NGS[k]
for k2 in AE_gene_expression_studies_Microarray:
    if k2 in AE_ngs_microarray:
        AE_ngs_microarray[k2] += AE_gene_expression_studies_Microarray[k2]
    else:
        other_microarray += AE_gene_expression_studies_Microarray[k2]
# Usage of high throughput sequencing for expression studies on ArrayExpress
AE_number_of_NGS_GE_studies = sum(AE_gene_expression_studies_NGS.values())
AE_number_of_Microarray_studies = sum(AE_gene_expression_studies_Microarray.values())
# Number of series with data for each species (including different strains) of bacteria at ArrayExpress
AE_supposed_bacterial_species_count = read_ArrayExpress_file("ArrayExpress-Experiments-Bacteria.txt")
AE_bacterial_species_count = get_dict_of_bacteria(AE_supposed_bacterial_species_count, list_bacteria)[0]
# Number of series with data for each species of bacteria on ArrayExpress
# Different strains are merged into the same species
AE_merged_bacteria = merge_bacteria_strains(AE_bacterial_species_count)
# Number of series with data for bacterial organisms on ArrayExpress
AE_bacterial_data = sum(AE_merged_bacteria.values())
# Number of series with data for organisms that are not bacterial on ArrayExpress
# 12375 is the total number of series available exclusively on ArrayExpress (filtered by "AE only" on the platform's search engine)
AE_non_bacterial_data = 12375 - AE_bacterial_data
# Proportion of the number of series with bacterial data compared to non-bacterial data on ArrayExpress
AE_bacterial_proportion = round(AE_bacterial_data / AE_non_bacterial_data * 100)
# Bacteria species (including different strains) sorted by number of series on ArrayExpress
most_represented_AE_bacteria = get_most_represented_bacteria(AE_merged_bacteria)
# Top 10 most represented bacteria species on ArrayExpress
top_10_AE_bacteria = most_represented_AE_bacteria[:10]
# Number of bacteria species for which data is available at ArrayExpress
AE_number_of_analysed_bacterial_genomes = len(most_represented_AE_bacteria)

### PROCESSING IS COMPLETE
### DATA IS COLLECTED AND READY TO BE VISUALIZED