Commit ea365ef6 authored by iquasere

Now reads memory in Gb

Fix general report columns
parent 3fff9cc4
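
Context for the change: available memory is now queried with psutil and expressed in Gb rather than Mb. A minimal sketch of that conversion, assuming the same 1024-based units used in the diff below (variable names are illustrative):

```python
import psutil

# Free memory in bytes, converted to Gb (1 Gb = 1024 ** 3 bytes, as in the code below)
available_gb = psutil.virtual_memory().available / (1024.0 ** 3)
print('Available memory: {:.1f} Gb'.format(available_gb))
```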
......@@ -11,7 +11,7 @@
[Initial quality assessment] Overrepresented sequences
[Initial quality assessment] Adapter Content
[Adapter removal] adapter files
[Adapter removal] # of reads
[Before quality trimming] # of reads
[Before quality trimming] Per base sequence quality
[Before quality trimming] Per base sequence quality
[Before quality trimming] Per tile sequence quality
......@@ -23,8 +23,8 @@
[Before quality trimming] Sequence Duplication Levels
[Before quality trimming] Overrepresented sequences
[Before quality trimming] Adapter Content
[Quality trimming] # of reads
[Quality trimming] Parameters
[After quality trimming] # of reads
[After quality trimming] Per base sequence quality
[After quality trimming] Per base sequence quality
[After quality trimming] Per tile sequence quality
......
......@@ -15,6 +15,7 @@ import os
import pandas as pd
from mosca_tools import run_command
from progressbar import ProgressBar
import psutil
class Annotater:
......@@ -43,6 +44,10 @@ class Annotater:
parser.add_argument("-mts", "--max-target-seqs", type=str, help="Number of identifications for each protein")
parser.add_argument("--download-uniprot", action="store_true", default=False,
help="Download uniprot database if FASTA DB doesn't exist")
parser.add_argument("-b", "--block-size", default=None,
help="Number of annotations to output per sequence inputed")
parser.add_argument("-c", "--index-chunks", default=None,
help="Number of annotations to output per sequence inputed")
args = parser.parse_args()
......@@ -81,8 +86,22 @@ class Annotater:
def generate_diamond_database(self, fasta, dmnd):
run_command('diamond makedb --in {} -d {}'.format(fasta, dmnd))
def run_diamond(self, query, aligned, unaligned, database, threads='12',
max_target_seqs='50'):
def b_n_c(self, argsb, argsc):
if argsb is not None:
b = argsb
else:
b = psutil.virtual_memory().available / (1024.0 ** 3) / 20 # b = memory in Gb / 20
if argsc is not None:
return b, argsc
if b > 3:
return b, 1
if b > 2:
return b, 2
if b > 1:
return b, 3
return b, 4
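
A hedged sketch of what b_n_c computes, written as a standalone mirror of the method above (the function name and example figures are illustrative, not part of the commit): DIAMOND's block size -b defaults to a twentieth of the free memory in Gb, and the number of index chunks -c grows as less memory is available.

```python
import psutil

def pick_b_and_c(args_b=None, args_c=None):
    # Same decision logic as Annotater.b_n_c above
    b = args_b if args_b is not None else psutil.virtual_memory().available / (1024.0 ** 3) / 20
    if args_c is not None:
        return b, args_c
    for threshold, chunks in ((3, 1), (2, 2), (1, 3)):
        if b > threshold:
            return b, chunks
    return b, 4

# e.g. with ~64 Gb free: b = 3.2, c = 1; with ~16 Gb free: b = 0.8, c = 4
```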
def run_diamond(self, query, aligned, unaligned, database, threads='12', max_target_seqs='50', b=None, c=None):
if database[-6:] == '.fasta' or database[-4:] == '.faa':
print('FASTA database was provided')
if not os.path.isfile(database.replace('fasta', 'dmnd')):
......@@ -102,8 +121,8 @@ class Annotater:
database = database.split('.dmnd')[0]
run_command(
"diamond blastp --query {} --out {} --un {} --db {} --outfmt 6 --unal 1 --threads {} --max-target-seqs {}".format(
query, aligned, unaligned, database, threads, max_target_seqs))
"diamond blastp --query {} --out {} --un {} --db {} --outfmt 6 --unal 1 --threads {} --max-target-seqs {} "
"-b {} -c {}".format(query, aligned, unaligned, database, threads, max_target_seqs, b, c))
def run(self):
args = self.get_arguments()
......@@ -115,11 +134,11 @@ class Annotater:
self.download_uniprot('/'.join(args.database.split('/')[:-1]))
args.database = '{}/uniprot.fasta'.format('/'.join(args.database.split('/')[:-1]))
# TODO - annotation has to be refined to retrieve better than hypothetical proteins
(b, c) = self.b_n_c(argsb=args.block_size, argsc=args.index_chunks)
self.run_diamond('{}/fgs.faa'.format(args.output), '{}/aligned.blast'.format(args.output),
'{}/unaligned.fasta'.format(args.output), args.database,
threads=args.threads,
max_target_seqs=args.max_target_seqs) # TODO - annotation has to be refined to retrieve better than hypothetical proteins
'{}/unaligned.fasta'.format(args.output), args.database, threads=args.threads,
max_target_seqs=args.max_target_seqs, b=b, c=c)
def further_annotation(self, data, temp_folder='temp',
dmnd_db='MOSCA/Databases/annotation_databases/uniprot.dmnd',
......@@ -171,8 +190,8 @@ class Annotater:
else:
if len(partial) > 0:
original_taxonomy = data.loc[data['Entry'] == query].iloc[0][tax_columns]
max_proximity = 0;
entry = np.nan;
max_proximity = 0
entry = np.nan
protein_name = np.nan
for i in range(len(partial)):
tax_score = self.score_taxonomic_proximity(original_taxonomy,
......
......@@ -13,6 +13,7 @@ import os
import pathlib
import shutil
from mosca_tools import run_command, run_pipe_command, perform_alignment
import psutil
class Assembler:
......@@ -31,18 +32,16 @@ class Assembler:
help="Number of threads to use. Default is number of CPUs available minus 2.")
parser.add_argument("-a", "--assembler", type=str, choices=["metaspades", "megahit"],
help="Tool for assembling the reads", default="metaspades")
parser.add_argument("-m", "--memory", default=None, type=float,
help="Maximum memory (Mb) available for assembly tools.")
# default memory is a third of total available memory
parser.add_argument("-m", "--memory", default=psutil.virtual_memory().available / (1024.0 ** 3) / 3, type=float,
help="Maximum memory (Gb) available for assembly tools.")
args = parser.parse_args()
args.output = args.output.rstrip('/')
args.reads = args.reads.split(',')
print(type(args.memory))
if hasattr(args, 'memory'):
if args.assembler == 'megahit': # Megahit reads max memory in byte
args.memory *= 1048576
else: # metaspades reads max memory in Gb
args.memory *= 0.0009765625
if args.assembler == 'megahit': # MEGAHIT reads max memory in bytes
args.memory *= 1024 ** 3
return args
......
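
A minimal sketch of the unit handling above, assuming the documented flag semantics of the two assemblers (MEGAHIT's -m takes bytes for values above 1, metaSPAdes' -m takes Gb); the default limit is a third of the free memory:

```python
import psutil

# Default: a third of the currently available memory, in Gb
memory_gb = psutil.virtual_memory().available / (1024.0 ** 3) / 3

assembler = 'megahit'
if assembler == 'megahit':
    memory = memory_gb * (1024 ** 3)   # MEGAHIT expects its memory limit in bytes
else:
    memory = memory_gb                 # metaSPAdes already takes its limit in Gb
```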
......@@ -16,7 +16,7 @@ import os
import sys
import pathlib
import time
from mosca_tools import run_command, run_pipe_command, parse_fastqc
from mosca_tools import run_command, run_pipe_command, parse_fastqc_report
class Preprocesser:
......@@ -57,7 +57,7 @@ class Preprocesser:
' --extract ' if extract else ' ', ' '.join(files)))
def has_adapters(self, fastqc_report):
data = parse_fastqc(fastqc_report)
data = parse_fastqc_report(fastqc_report)
if not data['Adapter Content'][0] == 'pass':
return True
terms_list = [
......@@ -262,7 +262,7 @@ class Preprocesser:
crop = float('inf')
headcrop = 0
for report in fastqc_reports:
data = parse_fastqc(report)
data = parse_fastqc_report(report)
if data['Per base sequence quality'][0] in ['warn', 'fail']:
parameter = self.get_crop(data)
if parameter < crop:
......
......@@ -61,12 +61,12 @@ class Reporter:
named 'name', and the columns that start with 'prefix'
'''
def info_from_fastqc(self, output_dir, name, prefix, prefix2terms, fastq_columns):
def info_from_fastqc(self, output_dir, name, col_name, prefix, prefix2terms, fastq_columns):
print(prefix)
reports = [parse_fastqc_report('{}/Preprocess/FastQC/{}{}_{}_fastqc/fastqc_data.txt'.format(
output_dir, prefix2terms[prefix][0], name, prefix2terms[prefix][i])) for i in [1, 2]]
self.report.loc[name, '{} # of reads'.format(prefix)] = reports[0]['Basic Statistics'][1].loc[
self.report.loc[col_name, '{} # of reads'.format(prefix)] = reports[0]['Basic Statistics'][1].loc[
'Total Sequences']['Value']
for column in fastq_columns:
......@@ -75,9 +75,9 @@ class Reporter:
reports[i][column] = (
'Not available', None) # only the not available matters. And nothing else matters!...
if reports[0][column][0] == reports[1][column][0]:
self.report.loc[name, '{} {}'.format(prefix, column)] = reports[0][column][0]
self.report.loc[col_name, '{} {}'.format(prefix, column)] = reports[0][column][0]
else:
self.report.loc[name, '{} {}'.format(prefix, column)] = (
self.report.loc[col_name, '{} {}'.format(prefix, column)] = (
'{} (forward) {} (reverse)'.format(
reports[0][column][0], reports[1][column][0]))
......@@ -107,7 +107,7 @@ class Reporter:
else: # is single-end
ends_name = input_file.split('/')[-1].split('.f')[0]
self.info_from_fastqc(output_dir, ends_name, '[Initial quality assessment]', prefix2terms, fastq_columns)
self.info_from_fastqc(output_dir, ends_name, name, '[Initial quality assessment]', prefix2terms, fastq_columns)
# After adapter removal
try:
......@@ -125,20 +125,20 @@ class Reporter:
else: # still using original files, no rRNA or adapter removal happened
right_name = ends_name
self.info_from_fastqc(output_dir, right_name, '[Before quality trimming]', prefix2terms, fastq_columns)
self.info_from_fastqc(output_dir, right_name, name, '[Before quality trimming]', prefix2terms, fastq_columns)
self.report.loc[name, '[Quality trimming] Parameters'] = '; '.join([
file for file in set(open('{}/Preprocess/Trimmomatic/{}_quality_params.txt'.format(output_dir, name)).read(
).split('\n')) if len(file) > 2]) # TODO - empty strings ('') seem to slip into this set; filter them out at the source
self.info_from_fastqc(output_dir, name, '[After quality trimming]', prefix2terms, fastq_columns)
self.info_from_fastqc(output_dir, name, name, '[After quality trimming]', prefix2terms, fastq_columns)
def set_samples(self, experiments):
experiments = experiments[['Name', 'Sample']]
self.report = pd.merge(experiments, self.report, left_on='Name', right_index=True, how='outer')
self.report = pd.merge(experiments[['Name', 'Sample']], self.report, left_on='Name', right_index=True,
how='outer')
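
For illustration, a small example of what this merge produces (data invented): each experiment row of the report, indexed by Name, gains its Sample column so the per-sample sections below can filter on it.

```python
import pandas as pd

experiments = pd.DataFrame({'Name': ['mg1', 'mt1'], 'Sample': ['sample1', 'sample1']})
report = pd.DataFrame({'[Initial quality assessment] # of reads': [100000, 80000]},
                      index=['mg1', 'mt1'])
merged = pd.merge(experiments[['Name', 'Sample']], report, left_on='Name',
                  right_index=True, how='outer')
# merged has columns: Name, Sample, '[Initial quality assessment] # of reads'
```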
def info_from_assembly(self, output_dir, sample):
print('Retrieving assembly information for sample ' + sample)
print('Retrieving assembly information for sample: ' + sample)
qc_report = pd.read_csv('{}/Assembly/{}/quality_control/report.tsv'.format(
output_dir, sample), sep='\t', index_col=0).transpose()
qc_report.index = [sample]
......@@ -147,7 +147,7 @@ class Reporter:
self.report.loc[self.report['Sample'] == sample, '[Assembly] ' + col] = qc_report.loc[sample][col]
def info_from_annotation(self, output_dir, sample):
print('Retrieving annotation information for sample ' + sample)
print('Retrieving annotation information for sample: ' + sample)
sample_report = dict()
sample_report['# of proteins detected'] = (
count_on_file('>', '{}/Annotation/{}/fgs.faa'.format(output_dir, sample)))
......@@ -170,6 +170,7 @@ class Reporter:
sample_report.loc[sample][col])
def info_from_binning(self, output_dir, sample):
print('Retrieving binning information for sample: ' + sample)
sample_report = dict()
sample_report['# of bins'] = len(glob.glob(
'{0}/Binning/{1}/{1}.*.fasta'.format(output_dir, sample)))
......@@ -242,7 +243,7 @@ class Reporter:
for mt_name in exps[exps["Data type"] == 'mrna']['Name']:
self.info_from_alignment(args.output, mt_name)
self.report.to_excel('{}/MOSCA_General_Report.xlsx'.format(args.output))
self.report.to_excel('{}/MOSCA_General_Report.xlsx'.format(args.output), index=False)
self.zip_files([
'{}/{}'.format(args.output, filename) for filename in [
......