
Commit c99b7898 authored by Davide Lagoa

kegg api created to complete PTS reactions GPR

parent 514c6f73
package pt.uminho.ceb.biosystems.transyt.scraper.APIs;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.time.Duration;
import java.time.Instant;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import pt.uminho.ceb.biosystems.transyt.utilities.connection.LinkConnection;
import pt.uminho.ceb.biosystems.transyt.utilities.files.FilesUtils;
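/**
 * Client for the KEGG REST API that retrieves, for each KEGG Orthology (KO) entry listed in the
 * KOs file, the amino acid sequences of the associated genes and writes them to a FASTA file.
 */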
public class KeggAPI {
public static final String PATH_LAST_KNOWN_VERSION = FilesUtils.getKeggFastaDirectory().concat("tcdbLastKnownVersion.log");
private static final String KEGG_FASTA_NAME = "keggFasta";
private static final Map<String, String> KOs = FilesUtils.readMapFromFile(FilesUtils.getKOsToSearchFilePath());
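// KEGG REST endpoint; entries are fetched as plain text, either a single KO (BASE_URL + "ko:" + <KO id>)
// or a batch of '+'-separated gene identifiers (BASE_URL + <query>)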
public static final String BASE_URL = "http://rest.kegg.jp/get/";
public static final int LIMIT = 5;
public static final int BATCH_SIZE = 10; //KEGG is limited to 10 items per request
public static final int DEFAULT_DELAY_MILLIS = 800;
private static final Logger logger = LoggerFactory.getLogger(KeggAPI.class);
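/**
 * For every KO in the KOs file: fetches the KO entry, scrapes the genes listed in its GENES
 * section, retrieves their amino acid sequences in batches and saves all results to a FASTA file.
 *
 * @return map of FASTA header to amino acid sequence
 */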
public static Map<String, String> searchKeegPTSGenesAndBuildFastaFiles(){
Map<String, String> results = new HashMap<>();
for(String ko : KOs.keySet()) {
logger.info("Searching KO: " + ko);
try {
LinkConnection connection = getKOInfo(ko);
Map<String, Set<String>> genes = scrapeGenesFromKoInfo(connection.getPage());
Set<String> queries = generateBatchesDistribution(genes);
Map<String, String> sequences = getGenesProteinSequence(queries, ko);
results.putAll(sequences);
} catch (Exception e) {
logger.error("An error occurred retrievig KO data: " + ko);
e.printStackTrace();
}
}
saveResults(results);
return results;
}
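/**
 * Opens a connection to the KEGG entry of the given KO, retrying up to LIMIT times with a
 * 30 second pause between attempts.
 *
 * @param ko the KO identifier
 * @return the successful connection, or null if every attempt fails
 * @throws InterruptedException
 */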
public static LinkConnection getKOInfo(String ko) throws InterruptedException {
int attempt = 0;
while(attempt < LIMIT){
try {
String link = BASE_URL + "ko:" + ko;
LinkConnection conn = new LinkConnection();
int code = conn.getCodeConnection(link);
if (code == 200){
return conn;
}
else{
System.out.println(link);
System.out.println(code);
attempt++;
logger.warn("Retrying connection... Attempt nr: {}", attempt);
TimeUnit.SECONDS.sleep(30);
}
}
catch (ArrayIndexOutOfBoundsException e1) {
attempt = LIMIT;
e1.printStackTrace();
logger.error("An error occurred while retrieving entry {}", ko);
logger.trace("StrackTrace: {}", e1);
}
catch (Exception e) {
e.printStackTrace();
attempt++;
logger.warn("Retrying connection... Attempt nr: {}", attempt);
TimeUnit.SECONDS.sleep(30);
logger.trace("StrackTrace: {}", e);
}
}
return null;
}
/**
 * Searches the amino acid sequence of every gene contained in the given batched queries
 * and builds the corresponding FASTA headers.
 *
 * @param queries batched KEGG gene identifiers, at most BATCH_SIZE per query
 * @param reference the KO identifier used to tag the FASTA headers
 * @return map of FASTA header to amino acid sequence
 * @throws InterruptedException
 */
public static Map<String, String> getGenesProteinSequence(Set<String> queries, String reference) throws InterruptedException{
Map<String, String> results = new HashMap<>();
Set<String> searched = new HashSet<>();
Set<String> failed = new HashSet<>();
logger.info("Searching protein sequences for each KEGG gene...");
boolean continueSearch = true;
while(continueSearch) {
int lastProgress = -1;
searched = new HashSet<>();
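// each query is a batch of up to BATCH_SIZE organism-prefixed gene identifiers joined by '+',
// e.g. "eco:b2415+eco:b2416"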
for(String query : queries){
boolean found = false;
int attempt = 0;
while(attempt < LIMIT && !found){
Instant start = Instant.now();
try {
String link = BASE_URL + query;
LinkConnection conn = new LinkConnection();
int code = conn.getCodeConnection(link);
if (code == 200){
results.putAll(scrapeGenesSequences(conn.getPage(), reference));
searched.add(query);
found = true;
int progress = ((searched.size()+failed.size())*100)/queries.size();
if(progress > lastProgress){
lastProgress = progress;
String message = progress + " % search complete";
logger.info(message);
}
applyWait(start);
}
else{
System.out.println(link);
System.out.println(code);
attempt++;
logger.warn("Retrying connection... Attempt nr: {}", attempt);
TimeUnit.SECONDS.sleep(30);
}
}
catch (ArrayIndexOutOfBoundsException e1) {
attempt = LIMIT;
e1.printStackTrace();
logger.error("An error occurred while retrieving entry {}", query);
logger.trace("StrackTrace: {}", e1);
}
catch (Exception e) {
e.printStackTrace();
attempt++;
logger.warn("Retrying connection... Attempt nr: {}", attempt);
TimeUnit.SECONDS.sleep(30);
logger.trace("StrackTrace: {}", e);
}
}
}
continueSearch = false;
}
return results;
}
/**
 * Parses a KEGG gene entry page and extracts the amino acid sequence of each entry
 * (the lines between the AASEQ and NTSEQ section headers).
 *
 * @param in reader over the KEGG flat-file response
 * @param reference the KO identifier used to build the FASTA headers
 * @return map of FASTA header to amino acid sequence
 * @throws IOException
 */
public static Map<String, String> scrapeGenesSequences(BufferedReader in, String reference) throws IOException {
Map<String, String> res = new HashMap<>();
String html;
boolean read = false;
String entry = null;
String organism = null;
String sequence = "";
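// KEGG flat-file format: ENTRY and ORGANISM identify the gene, and the amino acid sequence
// spans the lines between the AASEQ and NTSEQ section headers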
while ((html = in.readLine()) != null){
try {
Document doc = Jsoup.parse(html);
String text = doc.body().text().trim();
if(text.matches("^ENTRY\\s+.*")) {
entry = text.split("\\s+")[1].trim();
}
else if(text.matches("^ORGANISM\\s+.*")) {
organism = text.split("\\s+")[1].trim();
}
else if(text.matches("^AASEQ\\s+.*")) {
read = true;
}
else if(text.matches("^NTSEQ\\s+.*")) {
read = false;
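// FASTA header format: kegg|<KO>|<organism>_<entry>, extended with the value mapped to the KO in the KOs file when available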
String id = "kegg|" + reference + "|" + organism + "_" + entry;
if(KOs.containsKey(reference))
id = id.concat("|" + KOs.get(reference));
res.put(id, sequence.replaceAll("\n", ""));
sequence = "";
}
if(read && !text.matches("^AASEQ\\s+.*")) {
sequence = sequence.concat(text);
}
}
catch (Exception e) {
e.printStackTrace();
}
}
return res;
}
/**
 * Parses the GENES section of a KO entry page and groups the gene identifiers by organism code.
 *
 * @param in reader over the KEGG flat-file response
 * @return map of organism code to gene identifiers
 * @throws IOException
 */
public static Map<String, Set<String>> scrapeGenesFromKoInfo(BufferedReader in) throws IOException {
Map<String, Set<String>> res = new HashMap<>();
String html;
boolean read = false;
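// the GENES section lists the KO's genes grouped by organism, e.g. "ECO: b2415(ptsH) b2416(ptsI)";
// the gene name in parentheses is dropped below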
while ((html = in.readLine()) != null){
try {
Document doc = Jsoup.parse(html);
String text = doc.body().text().trim();
if(text.matches("^GENES\\s+.*")) {
read = true;
text = text.replaceAll("^GENES\\s+", ""); // strip the section label so the organism code becomes the first token
}
else if(text.matches("^REFERENCE\\s+.*")) {
read = false;
}
if(read) {
String[] line = text.split("\\s+");
String db = null;
for(int i = 0; i < line.length; i++) {
if(i == 0) {
db = line[0].trim().toLowerCase();
if(!res.containsKey(db)) {
res.put(db, new HashSet<>());
}
}
else {
Set<String> genes = res.get(db);
genes.add(line[i].split("\\(")[0].trim());
res.put(db, genes);
}
}
}
}
catch (Exception e) {
e.printStackTrace();
}
}
return res;
}
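/**
 * Joins the organism-prefixed gene identifiers into '+'-separated query strings of at most
 * BATCH_SIZE entries each, the maximum accepted by a single KEGG request.
 *
 * @param toSearch genes grouped by organism code
 * @return the batched query strings
 */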
public static Set<String> generateBatchesDistribution(Map<String, Set<String>> toSearch){
Set<String> queries = new HashSet<>();
String q = "";
int i = 0;
for(String db : toSearch.keySet()) {
Set<String> genes = toSearch.get(db);
for(String gene : genes) {
if(!q.isEmpty())
q = q.concat("+");
q = q.concat(db + gene);
i++;
if(i == BATCH_SIZE) {
queries.add(q.replaceAll("\n", ""));
q = "";
i = 0;
}
}
}
if(!q.isBlank())
queries.add(q.replaceAll("\n", ""));
return queries;
}
/**
 * Sleeps for whatever remains of DEFAULT_DELAY_MILLIS since the given start instant,
 * so that consecutive requests do not flood the KEGG server.
 *
 * @param start the instant at which the request was started
 */
public static void applyWait(Instant start) {
try {
long timeElapsed = Duration.between(start, Instant.now()).toMillis();
if(timeElapsed < DEFAULT_DELAY_MILLIS)
TimeUnit.MILLISECONDS.sleep(DEFAULT_DELAY_MILLIS - timeElapsed);
}
catch (InterruptedException e) {
e.printStackTrace();
}
}
/**
 * Writes the retrieved sequences to a new FASTA (.faa) file in the KEGG FASTA directory
 * and registers it as the last known version.
 *
 * @param sequences map of FASTA header to amino acid sequence
 */
private static void saveResults(Map<String, String> sequences) {
try {
String filePath = FilesUtils.getKeggFastaDirectory().concat(FilesUtils.generateFileName(KEGG_FASTA_NAME, ".faa"));
FilesUtils.saveLastKnownVersion(PATH_LAST_KNOWN_VERSION, filePath);
File fastaFile = new File(filePath);
FileWriter fstream = new FileWriter(fastaFile);
BufferedWriter out = new BufferedWriter(fstream);
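// one FASTA record per sequence: a '>' header line followed by the amino acid sequence and a blank line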
for(String seqID : sequences.keySet()) {
String sequence = sequences.get(seqID);
out.write(">" + seqID + "\n");
out.write(sequence+"\n\n");
}
out.close();
}
catch (Exception e) {
e.printStackTrace();
}
}
}
@@ -30,7 +30,7 @@ public class MetaCycAPI {
public static final int LIMIT = 5;
public static final int ALL_SEARCH_LIMIT = 2;
public static final int DEFAULT_DELAY_MILLIS = 1000;
public static final int DEFAULT_DELAY_MILLIS = 800;
public static final int BATCH_SIZE = 250;
protected static final String[] DATABASES = new String[] {"ECOLI", "META", "BSUB", "YEAST"};
......
@@ -116,7 +116,7 @@ public class Compare {
descriptionColumn = i+1;
data = ReadExcelFile.getData("C:\\Users\\Davide\\OneDrive - Universidade do Minho\\UMinho\\Tese\\Internal database\\results.xlsx");
data = ReadExcelFile.getData("C:\\Users\\Davide\\OneDrive - Universidade do Minho\\UMinho\\Tese\\Internal database\\results.xlsx", true, null);
performComparison(prepareInfomation());
......
@@ -106,6 +106,8 @@ public class GenerateTransportReactions {
newTcContainer.setFamily(tcNumberContainer.getFamily());
tcNumberContainer.filterReactionsNotBelongingToTransportType(evidence);
boolean metacycContainsMiddleCompartment = false;
if(metaCycData.containsKey(accession)) {
@@ -115,8 +117,19 @@ public class GenerateTransportReactions {
ReactionContainer rContainer = metacycReactions.get(rKey);
rContainer.setConfidenceLevel(METACYC_CONFIDENCE_LEVEL);
rContainer.setOriginalReaction(rContainer.getReaction());
TypeOfTransporter type = FindTransporters.findTypeOfTransport2(rContainer, tcNumber);
TypeOfTransporter type = null;
if(rContainer.getMetabolites().size() == 1)
type = TypeOfTransporter.Uniport;
else
type = FindTransporters.findTypeOfTransport2(rContainer, tcNumber);
if(type.equals(TypeOfTransporter.BiochemicalCoA)) {
String[] reactAux = correctMetaCycCoaReactions(rContainer);
rContainer.setReactant(reactAux[0]);
rContainer.setProduct(reactAux[1]);
}
rContainer.setTransportType(type);
@@ -128,12 +141,16 @@ public class GenerateTransportReactions {
if(evidence != null)
revEvidence = rContainer.isReversible();
if(!type.equals(TypeOfTransporter.Default))
if(!type.equals(TypeOfTransporter.Default)) {
metacycContainsMiddleCompartment = rContainer.getReaction().contains(ReactionContainer.MIDDLE_COMPARTMENT_TOKEN);
newTcContainer.addReaction(rContainer);
}
}
}
boolean skip = checkIfIgnoreTCDB(tcdbMetContainer.getMetabolites(tcNumber), newTcContainer.getAllReactionsIds().isEmpty());
boolean skip = checkIfIgnoreTCDB(metacycContainsMiddleCompartment, tcdbMetContainer.getMetabolites(tcNumber),
newTcContainer.getAllReactionsIds().isEmpty(),
tcNumberContainer.getTransportTypesAssociatedToProtein());
if(!skip && tcNumberContainer.getAllReactionsIds().size() == 0) {
@@ -276,6 +293,39 @@ public class GenerateTransportReactions {
}
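/**
 * Assigns compartments to the metabolites of CoA-dependent MetaCyc reactions: ATP and CoA are
 * kept in the interior compartment, every other reactant is tagged as exterior, all products are
 * tagged as interior and a proton (in) is appended to the product side.
 *
 * @param rContainer the reaction to correct
 * @return array holding the corrected reactant and product strings
 */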
private static String[] correctMetaCycCoaReactions(ReactionContainer rContainer) {
String[] res = new String[2];
String reactant = rContainer.getReactant();
String product = rContainer.getProduct();
String[] aux = reactant.split("\\s+\\+\\s+");
for(String mAux : aux) {
mAux = mAux.trim();
if(!mAux.equalsIgnoreCase("META:ATP") && !mAux.equalsIgnoreCase("META:CO-A"))
reactant = reactant.replace(mAux, mAux + " " + ReactionContainer.EXTERIOR_COMPARTMENT_TOKEN);
else
reactant = reactant.replace(mAux, mAux + " " + ReactionContainer.INTERIOR_COMPARTMENT_TOKEN);
}
aux = product.split("\\s+\\+\\s+");
for(String mAux : aux) {
mAux = mAux.trim();
product = product.replace(mAux, mAux + " " + ReactionContainer.INTERIOR_COMPARTMENT_TOKEN);
}
res[0] = reactant;
res[1] = product + " + META:PROTON (in)";
return res;
}
/**
* Method to assess if TCDB reactions should be ignored.
*
@@ -283,7 +333,13 @@ public class GenerateTransportReactions {
* @param currentContainerIsEmpty
* @return
*/
private static boolean checkIfIgnoreTCDB(List<String> metabolites, boolean currentContainerIsEmpty) {
private static boolean checkIfIgnoreTCDB(boolean containsMiddleCompartment, List<String> metabolites, boolean currentContainerIsEmpty, Set<TypeOfTransporter> types) {
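// MetaCyc reactions that span the middle compartment take precedence over TCDB; otherwise,
// CoA-dependent (biochemical) transporters are never ignored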
if(containsMiddleCompartment)
return true;
if(types.contains(TypeOfTransporter.BiochemicalCoA))
return false;
if(!currentContainerIsEmpty) {
@@ -544,7 +600,7 @@ public class GenerateTransportReactions {
ReactionContainer container = newTcContainer.getReactionContainer(id);
if(!container.getTransportType().equals(toKeep))
if(!container.getConfidenceLevel().equals(METACYC_CONFIDENCE_LEVEL) && !container.getTransportType().equals(toKeep) && !container.getTransportType().equals(TypeOfTransporter.Biochemical))
newTcContainer.removeReaction(id);
}
}
......
@@ -422,8 +422,11 @@ public class FindTransporters {
String reactant = reaction.getReactant();
String product = reaction.getProduct();
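// reactions that already include the middle-compartment token are classified as Biochemical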
if(reaction.getReaction().contains(ReactionContainer.MIDDLE_COMPARTMENT_TOKEN))
return TypeOfTransporter.Biochemical;
if(tc.matches("4\\.A\\..+"))
else if(tc.matches("4\\.A\\..+"))
return TypeOfTransporter.PEPdependent;
else if(reactant.contains(":ATP") && reactant.contains(":CO-A"))
......
@@ -4,10 +4,13 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import pt.uminho.ceb.biosystems.transyt.scraper.APIs.KeggAPI;
import pt.uminho.ceb.biosystems.transyt.scraper.APIs.MetaCycAPI;
import pt.uminho.ceb.biosystems.transyt.scraper.tcdb.reactionsGenerator.GenerateTransportReactions;
import pt.uminho.ceb.biosystems.transyt.scraper.tcdb.utilities.ProcessTcdbMetabolitesExcel;
import pt.uminho.ceb.biosystems.transyt.utilities.capsules.ReactionContainer;
@@ -20,180 +23,218 @@ import pt.uminho.ceb.biosystems.transyt.utilities.files.FilesUtils;
import pt.uminho.ceb.biosystems.transyt.utilities.files.JSONFilesUtils;
public class Retriever {
private static final Logger logger = LoggerFactory.getLogger(Retriever.class);
public static void runRetriever(boolean useCache, boolean tests, String accTest) throws Exception {
try {
///////// - SEARCHER
logger.info("Retrieving TCDB FASTA file...");
if(!useCache)
TcdbRetriever.getSubstrates();
ReadFastaTcdb.buildFastaFileForAlignments();
Set<String> tcNumbers = TcdbExplorer.getTcNumbers(true);