Commit 71e2cd10 authored by iquasere's avatar iquasere
Browse files

Run MT without MG

* MOSCA is now capable of running MT without MG
* Combined with approach to not use assembly
* In the future might implement the use of assembly to analyze MT
Also added rejected and accepted files from rRNA removal to list of files to delete at preprocessing

closes #3
parent 01004a9f
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="RSettings">
<option name="interpreterPath" value="C:\Program Files\R\R-4.0.3\bin\R.exe" />
</component>
</project>
\ No newline at end of file
import pandas as pd
import shutil
import sys
from mosca_tools import parse_blast, multi_sheet_excel
scripts_dir = sys.path[0]
......@@ -56,8 +57,7 @@ def all_input(wildcards):
)
else:
return (
expand("{output}/Annotation/{sample}/aligned.blast", output=config["output"],
sample=set(experiments['Sample']))
"{}/MOSCA_Entry_Counts_Report.xlsx".format(config["output"])
)
def preprocess_input(wildcards):
......@@ -83,6 +83,13 @@ def annotation_input(wildcards):
return expand("{output}/Preprocess/piled_{name}.fasta", output = config["output"],
name = wildcards.sample)
def upimapi_input(wildcards):
if config['do_assembly']:
return expand("{output}/Annotation/{sample}/aligned.blast",output=config["output"],
sample = set(experiments['Sample']))
return expand("{output}/Annotation/{name}/aligned.blast",output=config["output"],
name = set(experiments['Name']))
rule all:
input:
all_input
......@@ -184,16 +191,18 @@ rule annotation:
rule upimapi:
input:
expand("{output}/Annotation/{sample}/aligned.blast", output = config["output"],
sample = set(experiments["Sample"]))
upimapi_input
output:
expand("{output}/Annotation/uniprotinfo.tsv", output = config["output"])
expand("{output}/Annotation/uniprotinfo.tsv", output = config["output"]),
threads:
1
run:
shell("upimapi.py -i {input} -o {output}/Annotation/uniprotinfo --blast --full-id",
output = config["output"])
shell("echo 'done' > {output}/Annotation/{{sample}}.txt", output = config["output"])
if config["do_assembly"]:
shell("upimapi.py -i {input} -o {output}/Annotation/uniprotinfo --blast --full-id", output = config["output"])
else:
for blast in input:
shell("upimapi.py -i {blast} -o {output}/Annotation/uniprotinfo --blast --full-id",
output=config["output"])
rule recognizer:
input:
......@@ -259,7 +268,7 @@ rule join_information:
expand("{output}/Metatranscriptomics/{name}.readcounts", output = config["output"],
name = set(mt_experiments['Name'])),
expand("{output}/Annotation/{name}.readcounts", output = config["output"],
name = set(mg_experiments['Name'])),
name = set(mg_experiments['Name']))
output:
expand("{output}/MOSCA_Protein_Report.xlsx", output = config["output"]),
expand("{output}/MOSCA_Entry_Report.xlsx", output = config["output"]),
......@@ -272,6 +281,30 @@ rule join_information:
experiments = config["experiments"], output = config["output"],
normalization_method = config["normalization_method"])
rule entry_count:
input:
uniprotinfo=expand("{output}/Annotation/uniprotinfo.tsv",output=config["output"]),
blasts=expand("{output}/Annotation/{name}/aligned.blast",output=config["output"], name=set(experiments['Name']))
output:
"{}/MOSCA_Entry_Counts_Report.xlsx".format(config["output"])
threads:
1
run:
uniprotinfo=pd.read_csv(input.uniprotinfo[0], sep='\t')
result = pd.DataFrame(columns=['sseqid'])
i = 1
for blast in input.blasts:
print('[{}/{}] Quantifying entries for: {}'.format(i, len(input.blasts), blast))
data = parse_blast(blast).groupby('sseqid').size().reset_index(name=blast.split('/')[-2])
data['sseqid'] = [ide.split('|')[1] if ide != '*' else ide for ide in data['sseqid']]
result = pd.merge(result, data, on='sseqid', how='outer')
i += 1
result.columns = ['Entry'] + result.columns.to_list()[1:]
print('Merging entry counts with info at {}'.format(input.uniprotinfo[0]))
result = pd.merge(result, uniprotinfo, on='Entry', how='left')
multi_sheet_excel("{}/MOSCA_Entry_Counts_Report.xlsx".format(config["output"]), result, sheet_name='Sheet')
result.to_csv("{}/MOSCA_Entry_Counts_Report.tsv".format(config["output"]), index=False, sep='\t')
rule differential_expression:
input:
expand("{output}/Metatranscriptomics/expression_matrix.tsv", output = config["output"])
......
{% set name = "mosca" %}
{% set version = "1.2.3" %}
{% set version = "1.3.0" %}
{% set sha256 = "87cbca039ea9b9c85f417543f2426b2b2acffebe58179878ee6872a32ae949ba" %}
package:
......
......@@ -6,7 +6,7 @@ import argparse
import multiprocessing
import sys
__version__ = '1.2.3'
__version__ = '1.3.0'
parser = argparse.ArgumentParser(description="MOSCA's main script")
parser.add_argument("-s", "--snakefile", type=str, default="{}/Snakefile".format(sys.path[0]), help="Snakefile file")
......
......@@ -229,7 +229,6 @@ class Preprocesser:
self.remove_orphans(basename + '_forward.fastq',
basename + '_reverse.fastq')
'''
for file in files_to_delete:
os.remove(file)
print('Removed: {}'.format(file))
......@@ -356,9 +355,14 @@ class Preprocesser:
if self.paired:
args.input = ['{}/SortMeRNA/{}_{}.fastq'.format(
args.output, name, fr) for fr in ['forward', 'reverse']]
files_to_delete = ['{}/SortMeRNA/{}_{}.fastq'.format(
args.output, name, rejection) for rejection in ['accepted', 'rejected']]
else:
args.input = ['{}/SortMeRNA/{}_rejected.fq'.format(
args.output, name)]
files_to_delete = ['{}/SortMeRNA/{}_{}.fastq'.format(args.output, name, 'accepted')]
for file in files_to_delete:
os.remove(file)
self.run_fastqc(args.input, '{}/FastQC'.format(args.output), threads=args.threads)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment