GitLab CI for Docker builds is enabled! To use it, add a .gitlab-ci.yml file to your project. A file template is available at https://gitlab.bio.di.uminho.pt/snippets/5

Commit 9a22eefa authored by Davide Lagoa's avatar Davide Lagoa
Browse files

major refactoring, blast moved to service, relative paths eliminated

parent af468ddc
This diff is collapsed.
This diff is collapsed.
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>transyt</groupId>
<artifactId>transyt-scraper</artifactId>
<groupId>pt.uminho.ceb.biosystems.transyt</groupId>
<artifactId>scraper</artifactId>
<version>0.0.1-SNAPSHOT</version>
<packaging>jar</packaging>
<name>transyt-scraper</name>
<name>scraper</name>
<url>http://maven.apache.org</url>
<properties>
......@@ -26,9 +27,31 @@
<release>10</release>
</configuration>
</plugin>
<plugin>
<artifactId>maven-assembly-plugin</artifactId>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
<configuration>
<archive>
<manifest>
<addClasspath>true</addClasspath>
<mainClass>pt.uminho.ceb.biosystems.transyt.scraper.tcdb.tcdbTransportTypesRetriever.Retriever</mainClass>
</manifest>
</archive>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
</configuration>
</plugin>
</plugins>
</build>
<repositories>
<repository>
<id>ebi-repo</id>
......@@ -37,12 +60,12 @@
</repository>
</repositories>
<dependencies>
<dependency>
<groupId>transyt</groupId>
<artifactId>transyt-utilities</artifactId>
<groupId>pt.uminho.ceb.biosystems.transyt</groupId>
<artifactId>utilities</artifactId>
<version>0.0.1-SNAPSHOT</version>
</dependency>
......@@ -108,20 +131,20 @@
<artifactId>json-simple</artifactId>
<version>1.1.1</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.seleniumhq.selenium/selenium-server -->
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-server</artifactId>
<version>3.9.1</version>
</dependency>
<!-- https://mvnrepository.com/artifact/uk.ac.ebi.chebi.webapps.chebiWS.client/chebiWS-client -->
<dependency>
<groupId>uk.ac.ebi.chebi.webapps.chebiWS.client</groupId>
<artifactId>chebiWS-client</artifactId>
<version>2.4</version>
</dependency>
</dependencies>
</project>
package blast;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Observable;
import java.util.Observer;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import org.biojava.nbio.core.sequence.ProteinSequence;
import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
import org.biojava.nbio.core.sequence.io.FastaReader;
import org.biojava.nbio.core.sequence.io.FastaReaderHelper;
import org.biojava.nbio.core.sequence.io.GenericFastaHeaderParser;
import org.biojava.nbio.core.sequence.io.ProteinSequenceCreator;
import org.biojava.nbio.core.sequence.template.AbstractSequence;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import connection.TcdbExplorer;
import pt.uminho.ceb.biosystems.merlin.bioapis.externalAPI.ncbi.CreateGenomeFile;
import pt.uminho.ceb.biosystems.merlin.local.alignments.core.RunSimilaritySearch;
import pt.uminho.ceb.biosystems.merlin.utilities.Enumerators.AlignmentScoreType;
import pt.uminho.ceb.biosystems.merlin.utilities.Enumerators.Method;
import pt.uminho.ceb.biosystems.merlin.utilities.containers.capsules.AlignmentCapsule;
import utilities.triage_utilities.Properties;
/**
 * Runs a similarity search of a query protein FASTA file against the TCDB sequences and
 * exposes the resulting alignments.
 * <p>
 * The whole search is executed from the constructor. Failures are logged and not rethrown,
 * so {@link #getResults()} may return {@code null} afterwards.
 *
 * @author Davide
 */
public class Blast implements Observer {

    private static final Logger logger = LoggerFactory.getLogger(Blast.class);

    private ConcurrentLinkedQueue<AlignmentCapsule> results;
    private String currentTempDirectory;
    private Integer queryFileSize = 0;
    private Properties properties;
    private String queryFilePath;
    private boolean forceBlast;

    /**
     * Stores the configuration and immediately performs the search.
     * <p>
     * Any exception raised while searching is logged and swallowed; in that case the results
     * stay {@code null}.
     *
     * @param forceBlast    forwarded to {@code RunSimilaritySearchTriage}
     * @param queryFilePath path of the FASTA file with the query protein sequences
     * @param properties    supplies the temporary directory and the e-value, bit score and
     *                      query coverage thresholds
     */
    public Blast(boolean forceBlast, String queryFilePath, Properties properties) {
        try {
            this.forceBlast = forceBlast;

            logger.info("Blast process initializing...");

            this.properties = properties;
            this.queryFilePath = queryFilePath;
            this.currentTempDirectory = properties.getCurrentTempDirectory();

            this.results = performBlast();

            logger.info("Blast process finished!");
        }
        catch (FileNotFoundException e) {
            // May also be raised by the TCDB download (e.g. HTTP 404), so keep the cause in the log.
            logger.error("The genome file does not exist in the given path!!", e);
        }
        catch (Exception e) {
            logger.error("Blast process failed", e);
        }
    }

    /**
     * Method where all configurations to perform BLAST are set.
     * <p>
     * Downloads the TCDB sequences, writes them to {@code tcdbSEQs.txt} inside the temporary
     * directory, reads the query FASTA file and runs the search.
     *
     * @return the alignments found, or {@code null} when the query file holds no sequences
     * @throws Exception if the TCDB download, the FASTA parsing or the search fails
     */
    private ConcurrentLinkedQueue<AlignmentCapsule> performBlast() throws Exception {

        // NOTE(review): plain concatenation, so the temp directory is assumed to end with a
        // path separator - confirm against Properties.getCurrentTempDirectory().
        String tcdbFastaFile = currentTempDirectory.concat("tcdbSEQs.txt");

        // TODO: handle the case where the network is not reachable.
        logger.info("Downloading FASTA file from TCDB at: {}", TcdbExplorer.TCDB_FASTA_URL);

        Map<String, AbstractSequence<?>> tcdbGenes = getTcdbInMapFormat();

        logger.debug("Saving TCDB fasta in local folder: {}", tcdbFastaFile);

        CreateGenomeFile.buildFastaFile(tcdbFastaFile, tcdbGenes);

        logger.debug("Reading given target genome FASTA at: {}", queryFilePath);
        logger.debug("Temporary directory: {}", currentTempDirectory);

        ConcurrentHashMap<String, AbstractSequence<?>> sequences = new ConcurrentHashMap<String, AbstractSequence<?>>();
        sequences.putAll(FastaReaderHelper.readFastaProteinSequence(new File(queryFilePath)));

        queryFileSize = sequences.size();

        logger.info("Blast process initialized!");

        RunSimilaritySearchTriage similaritySearch = new RunSimilaritySearchTriage(forceBlast, tcdbGenes, 0.3,
                Method.SmithWaterman, sequences, new AtomicBoolean(false), new AtomicInteger(0), AlignmentScoreType.ALIGNMENT);

        similaritySearch.setSubjectFastaFilePath(tcdbFastaFile);
        similaritySearch.addObserver(this);
        similaritySearch.setWorkspaceTaxonomyFolderPath(currentTempDirectory);

        if (sequences.isEmpty()) {
            return null;
        }

        return similaritySearch.runBlastSearch(true, properties.getBlastEvalueThreshold(),
                properties.getBitScore(), properties.getQueryCoverage());
    }

    /**
     * Downloads the TCDB FASTA file from {@code TcdbExplorer.TCDB_FASTA_URL} and parses it.
     *
     * @return the parsed sequences, keyed as produced by the FASTA reader
     * @throws Exception if the download or the parsing fails
     */
    public static Map<String, AbstractSequence<?>> getTcdbInMapFormat() throws Exception {

        StringBuilder content = new StringBuilder();

        // try-with-resources guarantees that the remote stream is released even on failure
        try (InputStream remote = new URL(TcdbExplorer.TCDB_FASTA_URL).openStream();
                BufferedReader reader = new BufferedReader(new InputStreamReader(remote, "utf-8"))) {

            String line;
            while ((line = reader.readLine()) != null) {
                content.append(line).append('\n');
            }
        }

        // Strip the HTML paragraph tags around the payload and one specific malformed record
        // (presumably a placeholder entry in the published file - verify it is still needed).
        String fasta = content.toString()
                .replace("</p>", "")
                .replace("<p>", "")
                .replace(">gnl|TC-DB|xxxxxx 3.A.1.205.14 \ndsfgdfg", "");

        InputStream fastaStream = new ByteArrayInputStream(fasta.getBytes("utf-8"));

        FastaReader<ProteinSequence, AminoAcidCompound> fastaReader = new FastaReader<ProteinSequence, AminoAcidCompound>(
                fastaStream,
                new GenericFastaHeaderParser<ProteinSequence, AminoAcidCompound>(),
                new ProteinSequenceCreator(AminoAcidCompoundSet.getAminoAcidCompoundSet()));

        Map<String, AbstractSequence<?>> tcdb = new HashMap<>();
        tcdb.putAll(fastaReader.process());

        return tcdb;
    }

    /**
     * Observer callback for the similarity search; notifications are currently ignored.
     */
    @Override
    public void update(Observable arg0, Object arg1) {
        // intentionally empty
    }

    /**
     * Groups the alignments by their query identifier.
     *
     * @return a new map from query identifier to its alignments, in iteration order of the
     *         results; empty when no results are available
     */
    public Map<String, List<AlignmentCapsule>> getAlignmentsByQuery() {

        Map<String, List<AlignmentCapsule>> alignmentMap = new HashMap<>();

        // results stay null when the search failed or the query file was empty
        if (this.results == null) {
            return alignmentMap;
        }

        for (AlignmentCapsule alignContainer : this.results) {
            alignmentMap.computeIfAbsent(alignContainer.getQuery(), query -> new ArrayList<>()).add(alignContainer);
        }

        return alignmentMap;
    }

    /**
     * @return the results; {@code null} when the search failed or the query file was empty
     */
    public ConcurrentLinkedQueue<AlignmentCapsule> getResults() {
        return results;
    }

    /**
     * @return the number of sequences read from the query file (0 until it has been read)
     */
    public Integer getQueryFileSize() {
        return queryFileSize;
    }
}
package blast;
import java.io.File;
import java.io.IOException;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Observable;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.atomic.AtomicBoolean;
import javax.xml.bind.JAXBContext;
import org.biojava.nbio.core.sequence.template.AbstractSequence;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import pt.uminho.ceb.biosystems.merlin.local.alignments.core.AlignmentsUtils;
import pt.uminho.ceb.biosystems.merlin.local.alignments.core.ModelMerge.ModelAlignments;
import pt.uminho.ceb.biosystems.merlin.utilities.Enumerators.AlignmentPurpose;
import pt.uminho.ceb.biosystems.merlin.utilities.blast.ncbi_blastparser.BlastIterationData;
import pt.uminho.ceb.biosystems.merlin.utilities.blast.ncbi_blastparser.Hit;
import pt.uminho.ceb.biosystems.merlin.utilities.blast.ncbi_blastparser.NcbiBlastParser;
import pt.uminho.ceb.biosystems.merlin.utilities.containers.capsules.AlignmentCapsule;
/**
* @author amaromorais
*
*/
public class BlastAlignmentTriage extends Observable implements ModelAlignments{
// private static final double FIXED_THRESHOLD = 1E-6;
// private static final double ALIGNMENT_MIN_SCORE = 0.0;
// private static final double BITSCORE_THRESHOLD = 50;
// private static final double COVERAGE_THRESHOLD = 0.20;
// private static final double ALIGNMENT_QUERY_LEN_THRESHOLD = 0.25;
// private static final double QUERY_HIT_LEN_THRESHOLD = 0.25;
private NcbiBlastParser blout;
private ConcurrentLinkedQueue<AlignmentCapsule> alignmentContainerSet;
private String alignmentMatrix, queryFasta, subjectFasta, blastOutputFolderPath;
private boolean isTransportersSearch = false;
private AtomicBoolean cancel;
private Map<String,AbstractSequence<?>> querySequences;
private JAXBContext jc;
private String ec_number;
private Map<String,Set<String>> closestOrthologs;
private Map<String,Set<Integer>> modules;
private ConcurrentLinkedQueue<String> sequencesWithoutSimilarities;
private AlignmentPurpose blastPurpose;
private double threshold;
private double evalueThreshold;
private double bitScoreThreshold;
private double queryCoverageThreshold;
private double alignmentMinScore;
private Double referenceTaxonomyThreshold;
private Map<String, List<String>> sequenceIdsSet;
private Map<String, Integer> kegg_taxonomy_scores;
private Integer referenceTaxonomyScore;
private boolean forceBlast;
final static Logger logger = LoggerFactory.getLogger(BlastAlignmentTriage.class);
/**
 * Creates an alignment task that uses the default filters: evalueThreshold (1E-6),
 * bitScoreThreshold (50), queryCoverageThreshold (0.80) and alignmentMinScore (0.0).
 *
 * @param forceBlast when {@code true}, {@link #run()} executes blastp before reading the report
 * @param queryFasta path of the query FASTA file
 * @param subjectFasta path of the subject FASTA file
 * @param querySequences query sequences, keyed by identifier
 * @param treshold default score threshold applied to the alignments
 * @param transportersSearch when {@code true}, the report file name gets the {@code _transporters} suffix
 * @param cancel flag checked before running; when set, the task does no work
 * @param alignmentContainerSet queue stored by this task (presumably filled with the capsules built - confirm)
 * @param jc JAXB context handed to the BLAST report parser
 */
public BlastAlignmentTriage(boolean forceBlast, String queryFasta, String subjectFasta, Map<String,AbstractSequence<?>> querySequences, double treshold, boolean transportersSearch, AtomicBoolean cancel, ConcurrentLinkedQueue<AlignmentCapsule> alignmentContainerSet, JAXBContext jc){

    // Same initialization as the full constructor, with the default thresholds filled in.
    this(forceBlast, queryFasta, subjectFasta, querySequences, treshold, 1E-6, 50, 0.80,
            transportersSearch, cancel, alignmentContainerSet, jc);
}
/**
 * Creates an alignment task with caller-supplied filters; only alignmentMinScore keeps its
 * default value (0.0).
 *
 * @param forceBlast when {@code true}, {@link #run()} executes blastp before reading the report
 * @param queryFasta path of the query FASTA file
 * @param subjectFasta path of the subject FASTA file
 * @param querySequences query sequences, keyed by identifier
 * @param treshold default score threshold applied to the alignments
 * @param evalueThreshold value passed to {@code setEvalueThreshold}
 * @param bitScoreThreshold value passed to {@code setBitScoreThreshold}
 * @param queryCoverageThreshold value passed to {@code setQueryCoverageThreshold}
 * @param transportersSearch when {@code true}, the report file name gets the {@code _transporters} suffix
 * @param cancel flag checked before running; when set, the task does no work
 * @param alignmentContainerSet queue stored by this task (presumably filled with the capsules built - confirm)
 * @param jc JAXB context handed to the BLAST report parser
 */
public BlastAlignmentTriage(boolean forceBlast, String queryFasta, String subjectFasta, Map<String,AbstractSequence<?>> querySequences, double treshold, double evalueThreshold,
        double bitScoreThreshold, double queryCoverageThreshold, boolean transportersSearch, AtomicBoolean cancel, ConcurrentLinkedQueue<AlignmentCapsule> alignmentContainerSet, JAXBContext jc){

    this.forceBlast = forceBlast;

    // filters are installed through their setters
    setEvalueThreshold(evalueThreshold);
    setBitScoreThreshold(bitScoreThreshold);
    setQueryCoverageThreshold(queryCoverageThreshold);
    setAlignmentMinScore(0.0);

    // search inputs
    this.querySequences = querySequences;
    this.subjectFasta = subjectFasta;
    this.queryFasta = queryFasta;

    // search configuration
    this.isTransportersSearch = transportersSearch;
    this.threshold = treshold;

    // collaborators shared with the caller
    this.jc = jc;
    this.cancel = cancel;
    this.alignmentContainerSet = alignmentContainerSet;
}
/**
 * Executes the alignment unless the task was cancelled.
 * <p>
 * The report file is named after the query FASTA file ({@code .faa} removed,
 * {@code _blastReport.xml} appended, plus {@code _transporters} for transporter searches).
 * It is placed in {@code blastOutputFolderPath} when that is set, otherwise in a
 * {@code reports} folder next to the parent folder of the subject FASTA file. When
 * {@code forceBlast} is set, blastp is executed to (re)generate the report; an existing
 * report is then parsed and turned into alignment capsules.
 * <p>
 * NOTE(review): observers are notified twice when the task is not cancelled (once inside the
 * guarded section and once at the end). This is kept as-is because observers may rely on it -
 * confirm whether it is intended.
 */
public void run(){

    if(!this.cancel.get()) {

        try {
            File subjectFile = new File(subjectFasta);

            // File.getName() copes with paths that contain no "/" at all, where
            // substring(lastIndexOf("/")) would throw.
            String outputFileName = new File(queryFasta).getName().replace(".faa", "").concat("_blastReport.xml");

            if(isTransportersSearch) {
                outputFileName = outputFileName.replace(".xml", "_transporters.xml");
            }

            File outputFolder;

            if(this.blastOutputFolderPath != null && !this.blastOutputFolderPath.isEmpty()) {
                outputFolder = new File(this.blastOutputFolderPath);
            }
            else {
                // <subject folder>/../reports, built without platform-specific separators
                File subjectFolder = subjectFile.getAbsoluteFile().getParentFile();
                outputFolder = new File(new File(subjectFolder, ".."), "reports");
            }

            File outputFile = new File(outputFolder, outputFileName);
            outputFile.getParentFile().mkdirs();

            if(forceBlast) {
                // Arguments are passed as a list so that paths containing spaces survive;
                // inheritIO() keeps blastp from blocking on an undrained output pipe.
                Process blastProcess = new ProcessBuilder("blastp",
                        "-query", this.queryFasta,
                        "-subject", this.subjectFasta,
                        "-out", outputFile.getAbsolutePath(),
                        "-outfmt", "5")
                        .inheritIO()
                        .start();

                int exitCode = blastProcess.waitFor();

                if(exitCode != 0) {
                    logger.warn("blastp exited with code {} for query {}", exitCode, this.queryFasta);
                }
            }

            if(outputFile.exists()){
                this.blout = new NcbiBlastParser(outputFile, this.jc);
                this.alignmentMatrix = blout.getMatrix();

                buildAlignmentCapsules();
            }
            else{
                logger.warn("blast output .xml file wasn't generated on {}", outputFile.getAbsolutePath());
            }
        }
        catch (UnknownHostException e2) {
            // NOTE(review): terminates the whole JVM with a success status - confirm this is intended.
            logger.error("NCBI service failed. Please try again. Shuting down...");
            System.exit(0);
        }
        catch (IOException e) {
            logger.error("Error while running blastp or reading its report", e);
        }
        catch (InterruptedException e) {
            // restore the flag so that the owning thread can react to the interruption
            Thread.currentThread().interrupt();
            logger.error("Interrupted while waiting for blastp to finish", e);
        }
        catch (OutOfMemoryError oue) {
            logger.error("Out of memory while processing the blast report", oue);
        }

        System.gc();

        setChanged();
        notifyObservers();
    }

    setChanged();
    notifyObservers();
}
public void buildAlignmentCapsules(){
List<BlastIterationData> iterations = this.blout.getResults();
Map<String, Double> queriesMaxScores = AlignmentsUtils.getSequencesAlignmentMaxScoreMap(querySequences, alignmentMatrix);
// System.out.println("querySequences----->"+querySequences);
// System.out.println(querySequences.keySet()+"\t"+querySequences.size());
// System.out.println("queriesMaxScores----->"+queriesMaxScores);
// System.out.println(queriesMaxScores.keySet()+"\t"+querySequences.size());
for(BlastIterationData iteration : iterations){
String queryID = iteration.getQueryDef().trim();
Integer queryLength = iteration.getQueryLen();
String [] query_array;
String query_org = "";
String queryLocus = "";
if(queryID.contains(":")) {
query_array = queryID.split(":");
query_org = query_array [0].trim();
queryLocus = query_array[1].trim();
}
else {
if(queryID.contains(" ")) {
queryID = new StringTokenizer(queryID," ").nextToken();
}
if(this.blastPurpose!=null && this.blastPurpose.equals(AlignmentPurpose.ORTHOLOGS)) {
for(String seqID : this.querySequences.keySet()) {
if(seqID.contains(queryID)) {
queryID = seqID;
query_array = queryID.split(":");
query_org = query_array [0].trim();
queryLocus = query_array[1].trim();
}
}
}
}
if(this.blastPurpose==null || !this.blastPurpose.equals(AlignmentPurpose.ORTHOLOGS) || (!this.sequenceIdsSet.containsKey(queryLocus) || sequenceIdsSet.get(queryLocus).isEmpty())){
// System.out.println("QUERY----->"+queryID);
double maxScore = queriesMaxScores.get(iteration.getQueryDef().trim());
double specificThreshold = this.threshold;
if(this.kegg_taxonomy_scores!=null && this.referenceTaxonomyScore!=null && this.referenceTaxonomyThreshold!=null)
if(this.kegg_taxonomy_scores.get(query_org)>=this.referenceTaxonomyScore)
specificThreshold = this.referenceTaxonomyThreshold;
List<Hit> hits = iteration.getHits();
if(hits!=null && !hits.isEmpty()){
for(Hit hit : hits){
if(!this.cancel.get()){
try {
String tcdbID = "";
String hitNum = hit.getHitNum();
String target = hit.getHitId();
Integer targetLength = iteration.getHitLength(hitNum);
Integer alignmentLength = iteration.getHitAlignmentLength(hitNum);
double alignmentScore = (iteration.getHitScore(hit)-this.alignmentMinScore)/(maxScore-this.alignmentMinScore);//alignmentMethod.getSimilarity(); //(((double)alignmentMethod.getScore()-alignmentMethod.getMinScore())/(alignmentMethod.getMaxScore()-alignmentMethod.getMinScore()))
//double similarityScore = iteration.getPositivesScore(hitNum);
//double identityScore = iteration.getIdentityScore(hitNum);
double bitScore = iteration.getHitBitScore(hit);
double eValue = iteration.getHitEvalue(hit);
double queryCoverage = iteration.getHitQueryCoverage(hitNum);//(double)(alingmentLength-iteration.getHitAlignmentGaps(hitNum))/(double)queryLength;
double tragetCoverage = iteration.getHiTargetCoverage(hitNum);//(double)(alingmentLength-iteration.getHitAlignmentGaps(hitNum))/(double)targetLength;
// double l1 = (double)queryLength/(double)targetLength;
// double l2 = (double)alingmentLength/(double)queryLength;
// double l3 = (double)alingmentLength/(double)targetLength;
double score = alignmentScore;//-1;
<