Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
João Sequeira
MOSCA
Commits
ea365ef6
Commit
ea365ef6
authored
Mar 02, 2021
by
iquasere
Browse files
Now reads memory in Gb
Fix on general report columns
parent
3fff9cc4
Changes
5
Hide whitespace changes
Inline
Side-by-side
resources/reporter_columns.txt
View file @
ea365ef6
...
...
@@ -11,7 +11,7 @@
[Initial quality assessment] Overrepresented sequences
[Initial quality assessment] Adapter Content
[Adapter removal] adapter files
[
Adapter removal
] # of reads
[
Before quality trimming
] # of reads
[Before quality trimming] Per base sequence quality
[Before quality trimming] Per base sequence quality
[Before quality trimming] Per tile sequence quality
...
...
@@ -23,8 +23,8 @@
[Before quality trimming] Sequence Duplication Levels
[Before quality trimming] Overrepresented sequences
[Before quality trimming] Adapter Content
[Quality trimming] # of reads
[Quality trimming] Parameters
[After quality trimming] # of reads
[After quality trimming] Per base sequence quality
[After quality trimming] Per base sequence quality
[After quality trimming] Per tile sequence quality
...
...
workflow/scripts/annotation.py
View file @
ea365ef6
...
...
@@ -15,6 +15,7 @@ import os
import
pandas
as
pd
from
mosca_tools
import
run_command
from
progressbar
import
ProgressBar
import
psutil
class
Annotater
:
...
...
@@ -43,6 +44,10 @@ class Annotater:
parser
.
add_argument
(
"-mts"
,
"--max-target-seqs"
,
type
=
str
,
help
=
"Number of identifications for each protein"
)
parser
.
add_argument
(
"--download-uniprot"
,
action
=
"store_true"
,
default
=
False
,
help
=
"Download uniprot database if FASTA DB doesn't exist"
)
parser
.
add_argument
(
"-b"
,
"--block-size"
,
default
=
None
,
help
=
"Number of annotations to output per sequence inputed"
)
parser
.
add_argument
(
"-c"
,
"--index-chunks"
,
default
=
None
,
help
=
"Number of annotations to output per sequence inputed"
)
args
=
parser
.
parse_args
()
...
...
@@ -81,8 +86,22 @@ class Annotater:
def generate_diamond_database(self, fasta, dmnd):
    """Build a DIAMOND database (.dmnd) from a FASTA file of protein sequences."""
    makedb_command = 'diamond makedb --in {} -d {}'.format(fasta, dmnd)
    run_command(makedb_command)
def
run_diamond
(
self
,
query
,
aligned
,
unaligned
,
database
,
threads
=
'12'
,
max_target_seqs
=
'50'
):
def b_n_c(self, argsb, argsc):
    """Choose DIAMOND's -b (block size) and -c (index chunks) parameters.

    Parameters
    ----------
    argsb : str, int, float or None
        User-supplied block size (args.block_size). argparse delivers this
        as a string, so it is coerced to float here; when None, it defaults
        to available memory in GiB divided by 20.
    argsc : str, int or None
        User-supplied number of index chunks (args.index_chunks). When None,
        it is derived from the block size: larger blocks get fewer chunks.

    Returns
    -------
    tuple
        (b, c) values ready to be interpolated into the diamond command line.
    """
    if argsb is not None:
        # Coerce: argparse passes --block-size as a string, and the original
        # comparisons below would raise TypeError on str > int in Python 3.
        b = float(argsb)
    else:
        # b = available memory in GiB / 20
        b = psutil.virtual_memory().available / (1024.0 ** 3) / 20
    if argsc is not None:
        return b, argsc
    # Heuristic: fewer index chunks for larger block sizes.
    if b > 3:
        return b, 1
    if b > 2:
        return b, 2
    if b > 1:
        return b, 3
    return b, 4
def
run_diamond
(
self
,
query
,
aligned
,
unaligned
,
database
,
threads
=
'12'
,
max_target_seqs
=
'50'
,
b
=
None
,
c
=
None
):
if
database
[
-
6
:]
==
'.fasta'
or
database
[
-
4
:]
==
'.faa'
:
print
(
'FASTA database was inputed'
)
if
not
os
.
path
.
isfile
(
database
.
replace
(
'fasta'
,
'dmnd'
)):
...
...
@@ -102,8 +121,8 @@ class Annotater:
database
=
database
.
split
(
'.dmnd'
)[
0
]
run_command
(
"diamond blastp --query {} --out {} --un {} --db {} --outfmt 6 --unal 1 --threads {} --max-target-seqs {}"
.
format
(
query
,
aligned
,
unaligned
,
database
,
threads
,
max_target_seqs
))
"diamond blastp --query {} --out {} --un {} --db {} --outfmt 6 --unal 1 --threads {} --max-target-seqs {}
"
"-b {} -c {}"
.
format
(
query
,
aligned
,
unaligned
,
database
,
threads
,
max_target_seqs
,
b
,
c
))
def
run
(
self
):
args
=
self
.
get_arguments
()
...
...
@@ -115,11 +134,11 @@ class Annotater:
self
.
download_uniprot
(
'/'
.
join
(
args
.
database
.
split
(
'/'
)[:
-
1
]))
args
.
database
=
'{}/uniprot.fasta'
.
format
(
'/'
.
join
(
args
.
database
.
split
(
'/'
)[:
-
1
]))
# TODO - annotation has to be refined to retrieve better than hypothetical proteins
(
b
,
c
)
=
self
.
b_n_c
(
argsb
=
args
.
block_size
,
argsc
=
args
.
index_chunks
)
self
.
run_diamond
(
'{}/fgs.faa'
.
format
(
args
.
output
),
'{}/aligned.blast'
.
format
(
args
.
output
),
'{}/unaligned.fasta'
.
format
(
args
.
output
),
args
.
database
,
threads
=
args
.
threads
,
max_target_seqs
=
args
.
max_target_seqs
)
# TODO - annotation has to be refined to retrieve better than hypothetical proteins
'{}/unaligned.fasta'
.
format
(
args
.
output
),
args
.
database
,
threads
=
args
.
threads
,
max_target_seqs
=
args
.
max_target_seqs
,
b
=
b
,
c
=
c
)
def
further_annotation
(
self
,
data
,
temp_folder
=
'temp'
,
dmnd_db
=
'MOSCA/Databases/annotation_databases/uniprot.dmnd'
,
...
...
@@ -171,8 +190,8 @@ class Annotater:
else
:
if
len
(
partial
)
>
0
:
original_taxonomy
=
data
.
loc
[
data
[
'Entry'
]
==
query
].
iloc
[
0
][
tax_columns
]
max_proximity
=
0
;
entry
=
np
.
nan
;
max_proximity
=
0
entry
=
np
.
nan
protein_name
=
np
.
nan
for
i
in
range
(
len
(
partial
)):
tax_score
=
self
.
score_taxonomic_proximity
(
original_taxonomy
,
...
...
workflow/scripts/assembly.py
View file @
ea365ef6
...
...
@@ -13,6 +13,7 @@ import os
import
pathlib
import
shutil
from
mosca_tools
import
run_command
,
run_pipe_command
,
perform_alignment
import
psutil
class
Assembler
:
...
...
@@ -31,18 +32,16 @@ class Assembler:
help
=
"Number of threads to use. Default is number of CPUs available minus 2."
)
parser
.
add_argument
(
"-a"
,
"--assembler"
,
type
=
str
,
choices
=
[
"metaspades"
,
"megahit"
],
help
=
"Tool for assembling the reads"
,
default
=
"metaspades"
)
parser
.
add_argument
(
"-m"
,
"--memory"
,
default
=
None
,
type
=
float
,
help
=
"Maximum memory (Mb) available for assembly tools."
)
# default memory is a third of total available memory
parser
.
add_argument
(
"-m"
,
"--memory"
,
default
=
psutil
.
virtual_memory
().
available
/
(
1024.0
**
3
)
/
3
,
type
=
float
,
help
=
"Maximum memory (Gb) available for assembly tools."
)
args
=
parser
.
parse_args
()
args
.
output
=
args
.
output
.
rstrip
(
'/'
)
args
.
reads
=
args
.
reads
.
split
(
','
)
print
(
type
(
args
.
memory
))
if
hasattr
(
args
,
'memory'
):
if
args
.
assembler
==
'megahit'
:
# Megahit reads max memory in byte
args
.
memory
*=
1048576
else
:
# metaspades reads max memory in Gb
args
.
memory
*=
0.0009765625
if
args
.
assembler
==
'megahit'
:
# Megahit reads max memory in byte
args
.
memory
*=
10e9
return
args
...
...
workflow/scripts/preprocess.py
View file @
ea365ef6
...
...
@@ -16,7 +16,7 @@ import os
import
sys
import
pathlib
import
time
from
mosca_tools
import
run_command
,
run_pipe_command
,
parse_fastqc
from
mosca_tools
import
run_command
,
run_pipe_command
,
parse_fastqc
_report
class
Preprocesser
:
...
...
@@ -57,7 +57,7 @@ class Preprocesser:
' --extract '
if
extract
else
' '
,
' '
.
join
(
files
)))
def
has_adapters
(
self
,
fastqc_report
):
data
=
parse_fastqc
(
fastqc_report
)
data
=
parse_fastqc
_report
(
fastqc_report
)
if
not
data
[
'Adapter Content'
][
0
]
==
'pass'
:
return
True
terms_list
=
[
...
...
@@ -262,7 +262,7 @@ class Preprocesser:
crop
=
float
(
'inf'
)
headcrop
=
0
for
report
in
fastqc_reports
:
data
=
parse_fastqc
(
report
)
data
=
parse_fastqc
_report
(
report
)
if
data
[
'Per base sequence quality'
][
0
]
in
[
'warn'
,
'fail'
]:
parameter
=
self
.
get_crop
(
data
)
if
parameter
<
crop
:
...
...
workflow/scripts/report.py
View file @
ea365ef6
...
...
@@ -61,12 +61,12 @@ class Reporter:
named 'name', and the columns that start with 'prefix'
'''
def
info_from_fastqc
(
self
,
output_dir
,
name
,
prefix
,
prefix2terms
,
fastq_columns
):
def
info_from_fastqc
(
self
,
output_dir
,
name
,
col_name
,
prefix
,
prefix2terms
,
fastq_columns
):
print
(
prefix
)
reports
=
[
parse_fastqc_report
(
'{}/Preprocess/FastQC/{}{}_{}_fastqc/fastqc_data.txt'
.
format
(
output_dir
,
prefix2terms
[
prefix
][
0
],
name
,
prefix2terms
[
prefix
][
i
]))
for
i
in
[
1
,
2
]]
self
.
report
.
loc
[
name
,
'{} # of reads'
.
format
(
prefix
)]
=
reports
[
0
][
'Basic Statistics'
][
1
].
loc
[
self
.
report
.
loc
[
col_
name
,
'{} # of reads'
.
format
(
prefix
)]
=
reports
[
0
][
'Basic Statistics'
][
1
].
loc
[
'Total Sequences'
][
'Value'
]
for
column
in
fastq_columns
:
...
...
@@ -75,9 +75,9 @@ class Reporter:
reports
[
i
][
column
]
=
(
'Not available'
,
None
)
# only the not available matters. And nothing else matters!...
if
reports
[
0
][
column
][
0
]
==
reports
[
1
][
column
][
0
]:
self
.
report
.
loc
[
name
,
'{} {}'
.
format
(
prefix
,
column
)]
=
reports
[
0
][
column
][
0
]
self
.
report
.
loc
[
col_
name
,
'{} {}'
.
format
(
prefix
,
column
)]
=
reports
[
0
][
column
][
0
]
else
:
self
.
report
.
loc
[
name
,
'{} {}'
.
format
(
prefix
,
column
)]
=
(
self
.
report
.
loc
[
col_
name
,
'{} {}'
.
format
(
prefix
,
column
)]
=
(
'{} (forward) {} (reverse)'
.
format
(
reports
[
0
][
column
][
0
],
reports
[
1
][
column
][
0
]))
...
...
@@ -107,7 +107,7 @@ class Reporter:
else
:
# is single-end
ends_name
=
input_file
.
split
(
'/'
)[
-
1
].
split
(
'.f'
)[
0
]
self
.
info_from_fastqc
(
output_dir
,
ends_name
,
'[Initial quality assessment]'
,
prefix2terms
,
fastq_columns
)
self
.
info_from_fastqc
(
output_dir
,
ends_name
,
name
,
'[Initial quality assessment]'
,
prefix2terms
,
fastq_columns
)
# After adapter removal
try
:
...
...
@@ -125,20 +125,20 @@ class Reporter:
else
:
# still using original files, no rRNA or adapter removal happened
right_name
=
ends_name
self
.
info_from_fastqc
(
output_dir
,
right_name
,
'[Before quality trimming]'
,
prefix2terms
,
fastq_columns
)
self
.
info_from_fastqc
(
output_dir
,
right_name
,
name
,
'[Before quality trimming]'
,
prefix2terms
,
fastq_columns
)
self
.
report
.
loc
[
name
,
'[Quality trimming] Parameters'
]
=
'; '
.
join
([
file
for
file
in
set
(
open
(
'{}/Preprocess/Trimmomatic/{}_quality_params.txt'
.
format
(
output_dir
,
name
)).
read
(
).
split
(
'
\n
'
))
if
len
(
file
)
>
2
])
# TODO - because '' must be interfering, try to cut the problem at the root before troubles
self
.
info_from_fastqc
(
output_dir
,
name
,
'[After quality trimming]'
,
prefix2terms
,
fastq_columns
)
self
.
info_from_fastqc
(
output_dir
,
name
,
name
,
'[After quality trimming]'
,
prefix2terms
,
fastq_columns
)
def
set_samples
(
self
,
experiments
):
experiments
=
experiments
[[
'Name'
,
'Sample'
]]
self
.
report
=
pd
.
merge
(
experiments
,
self
.
report
,
left_on
=
'Name'
,
right_index
=
True
,
how
=
'outer'
)
self
.
report
=
pd
.
merge
(
experiments
[[
'Name'
,
'Sample'
]]
,
self
.
report
,
left_on
=
'Name'
,
right_index
=
True
,
how
=
'outer'
)
def
info_from_assembly
(
self
,
output_dir
,
sample
):
print
(
'Retrieving assembly information for sample '
+
sample
)
print
(
'Retrieving assembly information for sample
:
'
+
sample
)
qc_report
=
pd
.
read_csv
(
'{}/Assembly/{}/quality_control/report.tsv'
.
format
(
output_dir
,
sample
),
sep
=
'
\t
'
,
index_col
=
0
).
transpose
()
qc_report
.
index
=
[
sample
]
...
...
@@ -147,7 +147,7 @@ class Reporter:
self
.
report
.
loc
[
self
.
report
[
'Sample'
]
==
sample
,
'[Assembly] '
+
col
]
=
qc_report
.
loc
[
sample
][
col
]
def
info_from_annotation
(
self
,
output_dir
,
sample
):
print
(
'Retrieving annotation information for sample '
+
sample
)
print
(
'Retrieving annotation information for sample
:
'
+
sample
)
sample_report
=
dict
()
sample_report
[
'# of proteins detected'
]
=
(
count_on_file
(
'>'
,
'{}/Annotation/{}/fgs.faa'
.
format
(
output_dir
,
sample
)))
...
...
@@ -170,6 +170,7 @@ class Reporter:
sample_report
.
loc
[
sample
][
col
])
def
info_from_binning
(
self
,
output_dir
,
sample
):
print
(
'Retrieving binning information for sample: '
+
sample
)
sample_report
=
dict
()
sample_report
[
'# of bins'
]
=
len
(
glob
.
glob
(
'{0}/Binning/{1}/{1}.*.fasta'
.
format
(
output_dir
,
sample
)))
...
...
@@ -242,7 +243,7 @@ class Reporter:
for
mt_name
in
exps
[
exps
[
"Data type"
]
==
'mrna'
][
'Name'
]:
self
.
info_from_alignment
(
args
.
output
,
mt_name
)
self
.
report
.
to_excel
(
'{}/MOSCA_General_Report.xlsx'
.
format
(
args
.
output
))
self
.
report
.
to_excel
(
'{}/MOSCA_General_Report.xlsx'
.
format
(
args
.
output
)
,
index
=
False
)
self
.
zip_files
([
'{}/{}'
.
format
(
args
.
output
,
filename
)
for
filename
in
[
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment