GitLab CI for Docker builds is enabled! You can enable it using a .gitlab-ci.yml file in your project. Check the file template at https://gitlab.bio.di.uminho.pt/snippets/5

Commit fc9cd681 authored by Davide Lagoa's avatar Davide Lagoa
Browse files

several updates

parent db3ad775
......@@ -64,24 +64,29 @@ public class WriteByMetabolitesID {
BiosynthMetabolites namesAndIDsContainer = getBiosynthDataByName(service, true); //154225 names
// BiosynthMetabolites namesAndIDsContainer = null;
// FileUtils.saveMapInFile2("C:\\Users\\Davide\\Documents\\InternalDB\\info\\getMetabolitesIDs3.txt", namesAndIDsContainer.getMetabolitesIDs());
// FilesUtils.saveMapInFile("C:\\Users\\Davide\\Documents\\InternalDB\\info\\getNamesLowerCase3.txt", namesAndIDsContainer.getNamesLowerCase());
// FilesUtils.saveMapInFile("C:\\Users\\Davide\\Documents\\InternalDB\\info\\getNamesLowerCaseWithoutSigns3.txt", namesAndIDsContainer.getNamesLowerCaseWithoutSigns());
// FilesUtils.saveMapInFile("C:\\Users\\Davide\\Documents\\InternalDB\\info\\getNamesWithoutSigns3.txt", namesAndIDsContainer.getNamesWithoutSigns());
Map<String, Set<TcNumberContainer2>> reactionsData = JSONFilesUtils.readJSONtcdbReactionsFile();
// Map<String, Set<TcNumberContainer2>> reactionsData2 = new HashMap<>();
//
// reactionsData2.put("O51235", reactionsData.get("O51235"));
//
// IdentifyReactionsMetabolites metabolitesIdentification = new IdentifyReactionsMetabolites(reactionsData2, namesAndIDsContainer, service);
//
// Map<String, Map<String, MetaboliteMajorLabel>> tcdbMetabolitesIDs = metabolitesIdentification.getTcdbMetabolitesIDs();
//
// Map<String, String[]> forChildsSearch = metabolitesIdentification.getforChildsSearch();
int n = 1;
while (n != 99) {
if(n == 100)
data = getBiosynthDBData(service);
else if(n == 101)
namesAndIDsContainer = getBiosynthDataByName(service, true);
else if(n == 102) {
data = getBiosynthDBData(service);
namesAndIDsContainer = getBiosynthDataByName(service, true);
}
test(data, service, namesAndIDsContainer);
test(namesAndIDsContainer, data, service, null, reactionsData, null);
System.out.println("Enter a random number to repeat (100 to repeat data retrieval) or 99 to finish: ");
......@@ -105,23 +110,33 @@ public class WriteByMetabolitesID {
}
}
public static void test(Map<String, BiosynthMetaboliteProperties> data, BiodbGraphDatabaseService service, BiosynthMetabolites namesAndIDsContainer) {
public static void test(BiosynthMetabolites namesAndIDsContainer, Map<String, BiosynthMetaboliteProperties> data, BiodbGraphDatabaseService service, Map<String, Map<String, MetaboliteMajorLabel>> tcdbMetabolitesIDs,
Map<String, Set<TcNumberContainer2>> reactionsData2, Map<String, String[]> forChildsSearch) {
try {
/////TRIAGE
// Map<String, Set<TcNumberContainer2>> reactionsData2 = new HashMap<>();
//
// reactionsData2.put("Q1D027", reactionsData.get("Q1D027"));
Map<String, Set<TcNumberContainer2>> reactionsData = JSONFilesUtils.readJSONtcdbReactionsFile();
IdentifyReactionsMetabolites metabolitesIdentification = new IdentifyReactionsMetabolites(reactionsData2, namesAndIDsContainer, service);
IdentifyReactionsMetabolites metabolitesIdentification = new IdentifyReactionsMetabolites(reactionsData, namesAndIDsContainer, service);
tcdbMetabolitesIDs = metabolitesIdentification.getTcdbMetabolitesIDs();
Map<String, Map<String, MetaboliteMajorLabel>> tcdbMetabolitesIDs = metabolitesIdentification.getTcdbMetabolitesIDs();
forChildsSearch = metabolitesIdentification.getforChildsSearch();
Map<String, String[]> forChildsSearch = metabolitesIdentification.getforChildsSearch();
// Map<String, Set<TcNumberContainer2>> reactionsData3 = new HashMap<>();
Map<String, Set<TcNumberContainer2>> newData = new TransportReactionsBuilder(-1, true, tcdbMetabolitesIDs, reactionsData, forChildsSearch, service, data).getResults();
// reactionsData3.put("O51235", reactionsData.get("O51235"));
JSONFilesUtils.writeJSONTriageReactions(newData);
/////TRIAGE
Map<String, Set<TcNumberContainer2>> newData = new TransportReactionsBuilder(tcdbMetabolitesIDs, reactionsData2, forChildsSearch, service, data).getResults();
// JSONFilesUtils.writeJSONTriageReactions(newData);
// Map<String, Set<TcNumberContainer2>> newData2 = new Has
new PopulateTriageNeo4jDatabase(newData);
////COUNTS
......@@ -162,7 +177,7 @@ public class WriteByMetabolitesID {
//
// }
// }
//
// for(String key : countsMap.keySet())
// System.out.println(key + "\t" + countsMap.get(key));
......@@ -183,8 +198,6 @@ public class WriteByMetabolitesID {
// newData2.put("P54862", newData.get("P54862"));
// newData2.put("P27243", newData.get("P27243"));
// new PopulateTriageNeo4jDatabase(newData);
/////TRIAGE
......@@ -209,10 +222,10 @@ public class WriteByMetabolitesID {
//
//
// FileUtils.saveMapInFile2("C:\\Users\\Davide\\Documents\\InternalDB\\info\\getMetabolitesIDs2.txt", namesAndIDsContainer.getMetabolitesIDs());
// FileUtils.saveMapInFile("C:\\Users\\Davide\\Documents\\InternalDB\\info\\getNamesLowerCase2.txt", namesAndIDsContainer.getNamesLowerCase());
// FileUtils.saveMapInFile("C:\\Users\\Davide\\Documents\\InternalDB\\info\\getNamesLowerCaseWithoutSigns2.txt", namesAndIDsContainer.getNamesLowerCaseWithoutSigns());
// FileUtils.saveMapInFile("C:\\Users\\Davide\\Documents\\InternalDB\\info\\getNamesWithoutSigns2.txt", namesAndIDsContainer.getNamesWithoutSigns());
// FileUtils.saveMapInFile2("C:\\Users\\Davide\\Documents\\InternalDB\\info\\getMetabolitesIDs3.txt", namesAndIDsContainer.getMetabolitesIDs());
// FileUtils.saveMapInFile("C:\\Users\\Davide\\Documents\\InternalDB\\info\\getNamesLowerCase3.txt", namesAndIDsContainer.getNamesLowerCase());
// FileUtils.saveMapInFile("C:\\Users\\Davide\\Documents\\InternalDB\\info\\getNamesLowerCaseWithoutSigns3.txt", namesAndIDsContainer.getNamesLowerCaseWithoutSigns());
// FileUtils.saveMapInFile("C:\\Users\\Davide\\Documents\\InternalDB\\info\\getNamesWithoutSigns3.txt", namesAndIDsContainer.getNamesWithoutSigns());
// System.out.println(namesAndIDsContainer.getMetabolitesIDs().get("Cu2+"));
......@@ -554,6 +567,7 @@ public class WriteByMetabolitesID {
e.printStackTrace();
}
return compounds;
}
......@@ -577,11 +591,11 @@ public class WriteByMetabolitesID {
if(useCache) {
namesLowerCaseWithoutSigns = FilesUtils.readMapFromFile("C:\\Users\\Davide\\Documents\\InternalDB\\info\\getNamesLowerCaseWithoutSigns2.txt");
namesWithoutSigns = FilesUtils.readMapFromFile("C:\\Users\\Davide\\Documents\\InternalDB\\info\\getNamesWithoutSigns2.txt");
namesLowerCase = FilesUtils.readMapFromFile("C:\\Users\\Davide\\Documents\\InternalDB\\info\\getNamesLowerCase2.txt");
namesLowerCaseWithoutSigns = FilesUtils.readMapFromFile("C:\\Users\\Davide\\Documents\\InternalDB\\info\\getNamesLowerCaseWithoutSigns3.txt");
namesWithoutSigns = FilesUtils.readMapFromFile("C:\\Users\\Davide\\Documents\\InternalDB\\info\\getNamesWithoutSigns3.txt");
namesLowerCase = FilesUtils.readMapFromFile("C:\\Users\\Davide\\Documents\\InternalDB\\info\\getNamesLowerCase3.txt");
compounds = FileUtils.readMapFromFile2("C:\\Users\\Davide\\Documents\\InternalDB\\info\\getMetabolitesIDs2.txt");
compounds = FileUtils.readMapFromFile2("C:\\Users\\Davide\\Documents\\InternalDB\\info\\getMetabolitesIDs3.txt");
return new BiosynthMetabolites(compounds, namesLowerCaseWithoutSigns, namesWithoutSigns, namesLowerCase);
}
......@@ -619,8 +633,13 @@ public class WriteByMetabolitesID {
// System.out.println("found ecoli >>>> " + names);
if(!names.isEmpty()) {
MetaboliteMajorLabel label = MetaboliteMajorLabel.valueOf(nodeProperties.get("major_label").toString());
MetaboliteMajorLabel label;
if(entryID.contains("ECOLI:"))
label = MetaboliteMajorLabel.EcoCyc;
else
label = MetaboliteMajorLabel.valueOf(nodeProperties.get("major_label").toString());
if(nodeProperties.containsKey("formula"))
formula = nodeProperties.get("formula").toString();
......@@ -743,6 +762,13 @@ public class WriteByMetabolitesID {
e.printStackTrace();
}
if(compounds.containsKey("acyl-coenzyme a"))
compounds.get("acyl-coenzyme a").put(MetaboliteMajorLabel.MetaCyc, "META:ACYL-COA");
if(compounds.containsKey("acyl-Coenzyme a"))
compounds.get("acyl-Coenzyme a").put(MetaboliteMajorLabel.MetaCyc, "META:ACYL-COA");
return new BiosynthMetabolites(compounds, namesLowerCaseWithoutSigns, namesWithoutSigns, namesLowerCase);
}
......
......@@ -41,11 +41,13 @@ public class IdentifyReactionsMetabolites {
this.dictionary = new Synonyms();
Set<String> tcdbMetabolites = getMetabolitesFromReactions(reactionsData);
// Set<String> tcdbMetabolites = getMetabolitesFromReactions(reactionsData);
metabolites = getMetabolitesFromReactions(reactionsData).keySet();
logger.info("Total metabolites for search: {}", tcdbMetabolites.size());
logger.info("Total metabolites for search: {}", metabolites.size());
getMetabolitesIDs(tcdbMetabolites, namesAndIDsContainer, service);
getMetabolitesIDs(namesAndIDsContainer, service);
logger.info("Total found: {}", tcdbMetabolitesIDs.size());
......@@ -58,14 +60,14 @@ public class IdentifyReactionsMetabolites {
* @param allMetabolitesByName
* @return
*/
public Map<String, Map<String, MetaboliteMajorLabel>> getMetabolitesIDs(Set<String> metabolitesFromFile, BiosynthMetabolites namesAndIDsContainer, BiodbGraphDatabaseService service) {
public Map<String, Map<String, MetaboliteMajorLabel>> getMetabolitesIDs(BiosynthMetabolites namesAndIDsContainer, BiodbGraphDatabaseService service) {
tcdbMetabolitesIDs = new HashMap<>();
this.forChildsSearch = new HashMap<>();
// Set<String> forSearch = new HashSet<>(metabolites);
metabolites = new HashSet<>(standardizationOfNames1(metabolitesFromFile));
// metabolites = new HashSet<>(standardizationOfNames1(metabolitesFromFile, dictionary));
namesAndIDsContainer = standardizationOfNames2(namesAndIDsContainer);
......@@ -115,12 +117,15 @@ public class IdentifyReactionsMetabolites {
private void saveMetabolite(String metabolite, Map<MetaboliteMajorLabel, String> ids) {
MetaboliteMajorLabel id = selectMetaboliteMajorLabel(metabolite, ids);
if(id != null) {
Map<String, MetaboliteMajorLabel> map = new HashMap<>();
map.put(ids.get(id), id);
if(id.equals(MetaboliteMajorLabel.EcoCyc))
map.put(ids.get(id), MetaboliteMajorLabel.MetaCyc);
else
map.put(ids.get(id), id);
tcdbMetabolitesIDs.put(metabolite, map);
metabolites.remove(metabolite);
......@@ -342,9 +347,9 @@ public class IdentifyReactionsMetabolites {
* @param data
* @return
*/
private Set<String> getMetabolitesFromReactions(Map<String, Set<TcNumberContainer2>> data){
private Map<String, Set<String>> getMetabolitesFromReactions(Map<String, Set<TcNumberContainer2>> data){
Set<String> metabolites = new HashSet<>();
Map<String, Set<String>> metabolites = new HashMap<>();
for(String accession : data.keySet()) {
......@@ -356,7 +361,7 @@ public class IdentifyReactionsMetabolites {
String reaction = reactionContainer.getReaction();
metabolites.addAll(getMetabolitesToBeReplaced(reaction));
metabolites.putAll(getMetabolitesToBeReplaced(reaction, dictionary));
}
}
}
......@@ -371,7 +376,7 @@ public class IdentifyReactionsMetabolites {
* @param reaction
* @return
*/
public static Set<String> getMetabolitesToBeReplaced(String reaction){
public static Map<String, Set<String>> getMetabolitesToBeReplaced(String reaction, Synonyms dictionary){
Set<String> metabolites = new HashSet<>();
......@@ -382,13 +387,11 @@ public class IdentifyReactionsMetabolites {
for(String metab : metabs) {
metab = metab.replaceAll("^(\\+\\s)", "");
metab = metab.replaceAll("^(\\+\\s)", "").replaceAll("^(\\d+)", "");
metabolites.add(metab.trim());
}
return metabolites;
return standardizationOfNames3(metabolites, dictionary);
}
/**
......@@ -397,12 +400,10 @@ public class IdentifyReactionsMetabolites {
* @param metabolites
* @return
*/
private static Set<String> standardizationOfNames1(Set<String> metabolites) {
private static Set<String> standardizationOfNames1(Set<String> metabolites, Synonyms dictionary) {
Set<String> standardNames = new HashSet<>();
Synonyms dictionary = new Synonyms();
for(String metabolite : metabolites) {
if(metabolite.matches("(?i).+(-P)$"))
......@@ -475,6 +476,52 @@ public class IdentifyReactionsMetabolites {
return namesAndIDsContainer;
}
/**
 * Auxiliary method for normalization of names.
 *
 * Each input name is normalized in two steps: a trailing "-P" (case-insensitive) is
 * expanded to " phosphate", and every "ic acid"/"ic acids" occurrence (case-insensitive)
 * is rewritten to "ate". The normalized name, with all whitespace removed and in lower
 * case, is then looked up in the dictionary. When the dictionary returns a non-null
 * synonym, that synonym is used as the standard name; otherwise the normalized name is.
 *
 * @param metabolites names as they were read from the reactions
 * @param dictionary synonyms dictionary queried through {@code getSynonym}
 * @return map from each standard name to the set of original names that resolved to it
 */
private static Map<String, Set<String>> standardizationOfNames3(Set<String> metabolites, Synonyms dictionary) {

    Map<String, Set<String>> standardNames = new HashMap<>();

    for(String original : metabolites) {

        String metabolite = original;

        // The replacement must be a literal space: inside a replacement string "\\s" is an
        // escaped 's' (not the whitespace class), which produced e.g. "glucosesphosphate".
        if(metabolite.matches("(?i).+(-P)$"))
            metabolite = metabolite.replaceAll("(?i)(-P)$", " phosphate");

        metabolite = metabolite.replaceAll("(?i)(ic acids*)", "ate");

        String word = dictionary.getSynonym(metabolite.replaceAll("\\s+", "").toLowerCase());

        // Prefer the dictionary synonym; fall back to the normalized name itself.
        String key = (word != null) ? word : metabolite;

        Set<String> originals = standardNames.get(key);

        if(originals == null) {
            originals = new HashSet<>();
            standardNames.put(key, originals);
        }

        originals.add(original);
    }

    return standardNames;
}
/**
* Get the best database to retrieve the key
......@@ -483,6 +530,19 @@ public class IdentifyReactionsMetabolites {
* @return
*/
private MetaboliteMajorLabel selectMetaboliteMajorLabel(String metabolite, Map<MetaboliteMajorLabel, String> ids){
// if(metabolite.equalsIgnoreCase("AMP"))
// System.out.println(metabolite + "\t" + ids);
// if(metabolite.equalsIgnoreCase("Fatty acid"))
// System.out.println(metabolite + "\t" + ids);
// if(metabolite.equalsIgnoreCase("Coenzyme A"))
// System.out.println(metabolite + "\t" + ids);
// if(metabolite.equalsIgnoreCase("acyl-Coenzyme A"))
// System.out.println(metabolite + "\t" + ids);
// if(metabolite.equalsIgnoreCase("Diphosphate"))
// System.out.println(metabolite + "\t" + ids);
// if(metabolite.equalsIgnoreCase("ATP"))
// System.out.println(metabolite + "\t" + ids);
try {
......@@ -495,16 +555,25 @@ public class IdentifyReactionsMetabolites {
forChildsSearch.put(metabolite, entry);
}
else if(ids.containsKey(MetaboliteMajorLabel.EcoCyc)) { //ecocyc after metacyc
if(ids.containsKey(MetaboliteMajorLabel.ModelSeed))
return MetaboliteMajorLabel.ModelSeed;
else if(ids.containsKey(MetaboliteMajorLabel.MetaCyc))
String[] entry = new String[2];
entry[0] = ids.get(MetaboliteMajorLabel.MetaCyc);
entry[1] = MetaboliteMajorLabel.MetaCyc.toString();
forChildsSearch.put(metabolite, entry);
}
if(ids.containsKey(MetaboliteMajorLabel.MetaCyc))
return MetaboliteMajorLabel.MetaCyc;
else if(ids.containsKey(MetaboliteMajorLabel.EcoCyc))
return MetaboliteMajorLabel.EcoCyc;
if(ids.containsKey(MetaboliteMajorLabel.ModelSeed))
return MetaboliteMajorLabel.ModelSeed;
else if(ids.containsKey(MetaboliteMajorLabel.LigandCompound))
return MetaboliteMajorLabel.LigandCompound;
......
......@@ -16,66 +16,152 @@ import tcdb.capsules.ReactionContainer;
public class ReactionsPredictor {
private static final Integer Tmax = 10;
private static final Double ALPHA = 0.3;
private static final Double ALPHA = 0.8;
private static final Double BETA = 0.3;
private static final Integer MINIMUM_HITS = 2;
private static final Double THRESHOLD = 0.2;
private static final Logger logger = LoggerFactory.getLogger(ReactionsPredictor.class);
public static Map<String, Set<String>> getReactionsForGenes(Map<String, GeneContainer> data, Map<String, Set<ReactionContainer>> reactionContainers, Map<String, List<AlignmentCapsule>> blastResults) {
/**
 * Collects, for every entry of the BLAST results, the values returned by
 * {@code getTcdbID()} for the alignment capsules whose e-value is strictly below the
 * given threshold.
 *
 * @param blastResults alignment capsules grouped under a string key (presumably the query identifier; confirm against the caller)
 * @param eValueThreshold exclusive upper bound; a capsule is kept only when {@code getEvalue() < eValueThreshold}
 * @return map holding one entry per key of {@code blastResults}; a key with no accepted capsule maps to an empty set
 */
public static Map<String, Set<String>> getReactionsForGenesByEvalue(Map<String, List<AlignmentCapsule>> blastResults, double eValueThreshold) {
Map<String, Set<String>> results = new HashMap<>();
// NOTE(review): `data` is not a parameter of this method. Unless it is a field declared outside this view,
// this line looks like a leftover from the previous getReactionsForGenes body; confirm it should be removed.
System.out.println("TAMANHO TOTAL: " + data.size());
////////////////////////
// Map<String, List<AlignmentCapsule>> blastResults2 = new HashMap<>();
//
// blastResults2.put("WP_046131684.1 MULTISPECIES: Na+/H+ antiporter subunit D [Bacillus]", blastResults.get("WP_046131684.1 MULTISPECIES: Na+/H+ antiporter subunit D [Bacillus]"));
///////////////////////
for(String key : blastResults.keySet()) {
// Accepted identifiers for this key; a set, so repeated IDs collapse.
Set<String> tcNumbers = new HashSet<>();
for(AlignmentCapsule capsule: blastResults.get(key)) {
if(capsule.getEvalue() < eValueThreshold) {
tcNumbers.add(capsule.getTcdbID());
}
}
// Stored even when empty, so every input key is present in the result.
results.put(key, tcNumbers);
}
// for(String key : results.keySet())
// System.out.println(key + "\t" + results.get(key).size());
return results;
}
public static Map<String, Map<String, Set<String>>> getReactionsForGenesBySimilarities(Map<String, GeneContainer> data, Map<String, Set<String>> reactionsByTcNumber,
Map<String, List<AlignmentCapsule>> blastResults, Map<String, List<String>> mainReactions) {
Map<String, Map<String, Set<String>>> results = new HashMap<>();
logger.debug("Starting reactions to genes association.");
int lastProgress = -1;
int current = 0;
// Map<String, GeneContainer> data2 = new HashMap<>();
//
// data2.put("YP_003535950.1", data.get("YP_003535950.1")); //YP_003535791.1 //YP_003536402.1 -- 3.A.1.7.5
for(String queryAccession : data.keySet()) {
if(reactionContainers.get(queryAccession) != null && blastResults.get(queryAccession) != null) {
System.out.println();
System.out.println("query " + queryAccession);
System.out.println(reactionContainers.get(queryAccession).size());
System.out.println(blastResults.get(queryAccession).size());
System.out.println(data.get(queryAccession).getHomologousGenes());
try {
Map<String, Set<String>> reactions = findTransportReactions(data.get(queryAccession), reactionsByTcNumber, blastResults.get(queryAccession), mainReactions);
try {
Set<String> reactions = findTransportReactions(data.get(queryAccession), reactionContainers.get(queryAccession), blastResults.get(queryAccession));
results.put(queryAccession, reactions);
}
catch (Exception e) {
results.put(queryAccession, reactions);
}
catch (Exception e) {
logger.error("A problem occurred while searching the reactions for gene: {}", queryAccession);
logger.error("A problem occurred while searching the reactions for gene: {}", queryAccession);
System.out.println(reactionContainers.size());
e.printStackTrace();
}
current++;
int progress = (current*100)/data.size();
e.printStackTrace();
}
if(progress > lastProgress){
lastProgress = progress;
System.out.println(progress + " % search complete" );
}
// if(progress == 10) {
//
// System.out.println("breaking...");
//
// break;
//
// }
// if(progress == 10) {
//
// System.out.println("breaking...");
//
// break;
//
// }
current++;
Integer progress = (current*100)/data.size();
if(progress > lastProgress){
lastProgress = progress;
logger.trace(progress.toString().concat(" % search complete"));
}
}
return results;
}
// public static Map<String, Set<String>> getReactionsForGenes2(Map<String, GeneContainer> data, Map<String, Map<String, Set<ReactionContainer>>> reactionContainers,
// Map<String, List<AlignmentCapsule>> blastResults, Map<String, List<String>> mainReactions) {
//
// Map<String, Set<String>> results = new HashMap<>();
//
// logger.debug("Starting reactions to genes association.");
//
// int lastProgress = -1;
// int current = 0;
//
// // Map<String, GeneContainer> data2 = new HashMap<>();
// //
// // data2.put("YP_003535791.1", data.get("YP_003535791.1")); //YP_003535791.1 //YP_003536402.1 -- 3.A.1.7.5
//
// for(String queryAccession : data.keySet()) {
//
// if(reactionContainers.get(queryAccession) != null && blastResults.get(queryAccession) != null) {
//
// System.out.println("Size of containers " + reactionContainers.get(queryAccession).size());
//
// try {
// Set<String> reactions = findTransportReactions(data.get(queryAccession), reactionContainers.get(queryAccession), blastResults.get(queryAccession), mainReactions);
//
// results.put(queryAccession, reactions);
// }
// catch (Exception e) {
//
// logger.error("A problem occurred while searching the reactions for gene: {}", queryAccession);
// System.out.println(reactionContainers.size());
// e.printStackTrace();
// }
//
//
// // if(progress == 10) {
// //
// // System.out.println("breaking...");
// //
// // break;
// //
// // }
// }
//
// current++;
//
// Integer progress = (current*100)/data.size();
//
// if(progress > lastProgress){
//
// lastProgress = progress;
// logger.trace(progress.toString().concat(" % search complete"));
// }
// }
//
// return results;
// }
// /**
// * Find the most frequent type of transport between all hits
// *
......@@ -122,55 +208,103 @@ public class ReactionsPredictor {
/**
* @param geneContainer
*/
private static Set<String> findTransportReactions(GeneContainer geneContainer, Set<ReactionContainer> setReactionContainers, List<AlignmentCapsule> blastResults) {
private static Map<String, Set<String>> findTransportReactions(GeneContainer geneContainer, Map<String, Set<String>> reactionsByTcNumber, List<AlignmentCapsule> blastResults,
Map<String, List<String>> mainReactions) {
Map<String, Set<String>> results = new HashMap<>();
Map<String, Double> reactionsScores = new HashMap<>();
Map<String, Integer> hits = new HashMap<>();
// Set<String> allReactions = new HashSet<>();
double similaritySum = getSimilaritySum(geneContainer.getSimilarities());
Double similaritySum = 0.0;
Set<String> allReactions = getAllReactions(setReactionContainers);
for(AlignmentCapsule capsule : blastResults) {