Commit f8e7f2eb authored by iquasere's avatar iquasere
Browse files

Removed testing artifacts

parent 861ce906
......@@ -36,4 +36,5 @@ recognizer
maxquant
quast
unzip
keggcharter
\ No newline at end of file
keggcharter
samtools
\ No newline at end of file
......@@ -9,7 +9,7 @@ RUN buildDeps='build-essential zlib1g-dev' \
&& conda config --add channels conda-forge \
&& git clone https://github.com/iquasere/MOSCA.git \
&& conda install svn reportlab openpyxl xlrd>=0.9.0 r-rcolorbrewer pandas scikit-learn lxml biopython perl \
&& conda install -c bioconda fastqc sortmerna=2.1 seqtk trimmomatic megahit spades fraggenescan diamond upimapi htseq bowtie2 maxbin2 checkm-genome bioconductor-deseq2=1.22.1 bioconductor-edger=3.24.3 r-pheatmap r-optparse blast krona seqkit \
&& conda install -c bioconda fastqc sortmerna=2.1 seqtk trimmomatic megahit spades fraggenescan diamond upimapi htseq bowtie2 maxbin2 checkm-genome bioconductor-deseq2=1.22.1 bioconductor-edger=3.24.3 r-pheatmap r-optparse blast krona seqkit samtools \
&& conda install -c conda-forge progressbar33 tqdm>=4.33.0 xlsxwriter unzip \
&& conda install -c bioconda -c conda-forge recognizer maxquant keggcharter quast \
&& conda clean --all \
......
......@@ -10,7 +10,7 @@ conda config --add channels defaults
conda config --add channels bioconda
conda config --add channels conda-forge
conda install -y svn reportlab openpyxl xlrd>=0.9.0 r-rcolorbrewer pandas scikit-learn lxml biopython perl
conda install -y -c bioconda fastqc sortmerna=2.1 seqtk trimmomatic megahit spades fraggenescan diamond upimapi htseq bowtie2 checkm-genome bioconductor-edger=3.24.3 r-pheatmap r-optparse blast krona seqkit
conda install -y -c bioconda fastqc sortmerna=2.1 seqtk trimmomatic megahit spades fraggenescan diamond upimapi htseq bowtie2 checkm-genome bioconductor-edger=3.24.3 r-pheatmap r-optparse blast krona seqkit samtools
conda install -y -c bioconda maxbin2
conda install -y -c bioconductor-deseq2=1.22.1
conda install -y -c conda-forge progressbar33 tqdm>=4.33.0 xlsxwriter unzip
......
......@@ -54,6 +54,7 @@ requirements:
- maxquant
- quast
- keggcharter
- samtools
test:
commands:
......
......@@ -38,7 +38,7 @@ parser.add_argument("-o","--output",type=str,help="Directory for storing the res
metavar = "Directory", default = "/MOSCA_analysis")
parser.add_argument("-rd","--resources-directory",type=str,
help="Directory for storing temporary files and databases",
default = sys.path[0] + '/../resources_directory')
default = os.path.expanduser('~/resources_directory'))
parser.add_argument("-nopp","--no-preprocessing",action = "store_true",
help="Don't perform preprocessing", default = False)
parser.add_argument("-noas","--no-assembly",action = "store_true",
......@@ -435,7 +435,7 @@ functional_columns = ['COG general functional category', 'COG functional categor
'COG protein description', 'cog']
for sample in sample2name.keys():
mtools.timed_message('Joining data for sample:{}'.format(sample))
mtools.timed_message('Joining data for sample: {}'.format(sample))
# Join BLAST and reCOGnizer outputs
data = pd.merge(mtools.parse_blast('{}/Annotation/{}/aligned.blast'.format(args.output, sample)),
......@@ -459,7 +459,7 @@ for sample in sample2name.keys():
mtools.normalize_mg_readcounts_by_size(
'{}/Annotation/{}/{}.readcounts'.format(args.output, sample, mg_name),
'{}/Assembly/{}/contigs.fasta'.format(args.output, sample, mg_name))
'{}/Assembly/{}/contigs.fasta'.format(args.output, sample))
data = mtools.add_abundance(data,
'{}/Annotation/{}/{}_normalized.readcounts'.format(args.output, sample, mg_name),
mg_name, origin_of_data = 'metagenomics',
......@@ -475,8 +475,10 @@ for sample in sample2name.keys():
print('Finding consensus COG for each Entry of Sample: {}'.format(sample))
tqdm.pandas()
cogs_df = pd.DataFrame()
cogs_df['cog'] = data.groupby('Entry')['cog'].progress_apply(lambda x:x.value_counts().index[0] if len(x.value_counts().index) > 0 else np.nan)
cogs_df = data.groupby('Entry')['cog'].progress_apply(
lambda x:x.value_counts().index[0] if len(x.value_counts().index) > 0
else np.nan)
cogs_df.reset_index(inplace = True)
cogs_categories = data[functional_columns].drop_duplicates()
# Aggregate information for each Entry, keep UniProt information, sum MG and MT or MP quantification
......@@ -512,14 +514,14 @@ for sample in sample2name.keys():
data.groupby(taxonomy_columns)[mg_name].sum().reset_index()[
[mg_name] + taxonomy_columns].to_csv('{}_{}_tax.tsv'.format(
args.output, mg_name), sep = '\t', index = False, header = False)
mtools.run_command('perl Krona/KronaTools/scripts/ImportText.pl {0}.tsv -o {0}.html'.format(
mtools.run_command('ktImportText {0}.tsv -o {0}.html'.format(
'{}_{}_tax'.format(args.output, mg_name)))
# Draw the functional krona plot
data.groupby(functional_columns)[mg_name].sum().reset_index()[
[mg_name] + functional_columns].to_csv('{}_{}_fun.tsv'.format(
args.output, mg_name), sep = '\t', index = False, header = False)
mtools.run_command('perl Krona/KronaTools/scripts/ImportText.pl {0}.tsv -o {0}.html'.format(
mtools.run_command('ktImportText {0}.tsv -o {0}.html'.format(
'{}_{}_fun'.format(args.output, mg_name)))
'''
......
......@@ -575,11 +575,11 @@ class MoscaTools:
The readcounts by contig in the file will be normalized by contig size
'''
def normalize_mg_readcounts_by_size(self, readcounts, contigs):
self.run_pipe_command('head -n -5 {}'.format(readcounts),
file = readcounts.replace('.readcounts', '_no_tail.readcounts'))
self.run_pipe_command('head -n -5 {}'.format(readcounts), # I still don't know how to pipe two things together for the join command, when I do I'll be able to merge this two commands
output = readcounts.replace('.readcounts', '_no_tail.readcounts'))
self.run_pipe_command("seqkit fx2tab {} | sort | awk '{{print $1\"\\t\"length($2)}}' | join - {} | awk '{{print $1\"\\t\"$3/$2}}'".format(
contigs, readcounts.replace('.readcounts', '_no_tail.readcounts')),
file = readcounts.replace('.readcounts', '_normalized.readcounts'))
output = readcounts.replace('.readcounts', '_normalized.readcounts'))
'''
Input:
......
......@@ -87,7 +87,7 @@ class Reporter:
adapter + '_reverse_paired')
if adapter is not None else ('', 'R1', 'R2')), '[After quality trimming]': (
'quality_trimmed_', 'forward_paired', 'reverse_paired')}
'''
# Initial assessment
self.report.loc[name, '[Initial quality assessment] # of initial reads'] = mtools.count_on_file(
'@', input_file, compressed = True if input_file.endswith('.gz') else False)
......@@ -119,7 +119,7 @@ class Reporter:
# Quality trimming
self.info_from_fastqc(output_dir, name, '[Before quality trimming]', prefix2terms)
'''
self.report.loc[name, '[Quality trimming] Parameters'] = '; '.join([file for file in
set(open('{}/Preprocess/Trimmomatic/{}_quality_params.txt'.format(
output_dir, name)).read().split('\n')) if len(file) > 2]) # TODO - because '' must be interfering, try to cut the problem at the root before troubles
......@@ -137,10 +137,8 @@ class Reporter:
mtname2sample.append([mt_name, pair[1]])
name2sample = mg_name2sample + mtname2sample
name2sample = pd.DataFrame(name2sample, columns = ['Name', 'Sample'])
print(self.report)
self.report = pd.merge(name2sample, self.report, left_on = 'Name',
right_index = True, how = 'outer')
print(self.report)
def info_from_assembly(self, output_dir, sample):
print('Retrieving assembly information for sample ' + sample)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment