GitLab CI for Docker builds is enabled! You can enable it by adding a .gitlab-ci.yml file to your project. See the file template at https://gitlab.bio.di.uminho.pt/snippets/5

Commit 189af95f authored by Davide Lagoa
Browse files

Modified the method that finds identical metabolites by name and formula

parent 9f18bd15
......@@ -19,6 +19,7 @@ import containers.TcNumberContainer;
import pt.uminho.sysbio.biosynth.integration.io.dao.neo4j.MetaboliteMajorLabel;
import pt.uminho.sysbio.biosynth.integration.neo4j.BiodbMetaboliteNode;
import pt.uminho.sysbio.biosynthframework.BiodbGraphDatabaseService;
import reactions.IdentifyReactionsMetabolites;
import reactions.ReactionsMetabolites;
import utilities.FileUtils;
import utilities.JSONfiles;
......@@ -46,8 +47,8 @@ public class WriteByMetabolitesID {
System.out.println("Retrieving data...");
// logger.trace("salfnafsn");
Map<String, BiosynthMetaboliteProperties> data = getBiosynthDBData(service);
// Map<String, BiosynthMetaboliteProperties> data = null;
// Map<String, BiosynthMetaboliteProperties> data = getBiosynthDBData(service);
Map<String, BiosynthMetaboliteProperties> data = null;
@SuppressWarnings("resource")
Scanner reader = new Scanner(System.in);
......@@ -82,6 +83,8 @@ public class WriteByMetabolitesID {
n = reader.nextInt();
} catch (Exception e) {
e.printStackTrace();
n = reader.nextInt();
}
}
......@@ -100,41 +103,42 @@ public class WriteByMetabolitesID {
try {
// Map<String, Set<TcNumberContainer>> reactionsData = JSONfiles.readJSONtcdbReactionsFile();
//
Map<String, Set<TcNumberContainer>> reactionsData = JSONfiles.readJSONtcdbReactionsFile();
// Set<String> tcdbMetabolites = ReactionsMetabolites.getMetabolitesFromReactions(reactionsData);
//
// System.out.println("TOTAL FOR SEARCH: " + tcdbMetabolites.size());
//
// Map<String, Map<String, MetaboliteMajorLabel>> tcdbMetabolitesIDs = ReactionsMetabolites.getMetabolitesIDs(tcdbMetabolites, namesAndIDsContainer, service);
//
new IdentifyReactionsMetabolites(reactionsData, data, namesAndIDsContainer, service);
// System.out.println("TOTAL FOUND: " + tcdbMetabolitesIDs.size());
//
//
// FileUtils.saveMapInFile2("C:\\Users\\Davide\\Documents\\InternalDB\\info\\getMetabolitesIDs.txt", namesAndIDsContainer.getMetabolitesIDs());
// FileUtils.saveMapInFile("C:\\Users\\Davide\\Documents\\InternalDB\\info\\getNamesLowerCase.txt", namesAndIDsContainer.getNamesLowerCase());
// FileUtils.saveMapInFile("C:\\Users\\Davide\\Documents\\InternalDB\\info\\getNamesLowerCaseWithoutSigns.txt", namesAndIDsContainer.getNamesLowerCaseWithoutSigns());
// FileUtils.saveMapInFile("C:\\Users\\Davide\\Documents\\InternalDB\\info\\getNamesWithoutSigns.txt", namesAndIDsContainer.getNamesWithoutSigns());
// FileUtils.saveMapInFile2("C:\\Users\\Davide\\Documents\\InternalDB\\info\\getMetabolitesIDs2.txt", namesAndIDsContainer.getMetabolitesIDs());
// FileUtils.saveMapInFile("C:\\Users\\Davide\\Documents\\InternalDB\\info\\getNamesLowerCase2.txt", namesAndIDsContainer.getNamesLowerCase());
// FileUtils.saveMapInFile("C:\\Users\\Davide\\Documents\\InternalDB\\info\\getNamesLowerCaseWithoutSigns2.txt", namesAndIDsContainer.getNamesLowerCaseWithoutSigns());
// FileUtils.saveMapInFile("C:\\Users\\Davide\\Documents\\InternalDB\\info\\getNamesWithoutSigns2.txt", namesAndIDsContainer.getNamesWithoutSigns());
System.out.println(namesAndIDsContainer.getMetabolitesIDs().size());
// System.out.println(namesAndIDsContainer.getMetabolitesIDs().get("Cu2+"));
// if(namesAndIDsContainer.getMetabolitesIDs().containsKey("polyol"))
// System.out.println(namesAndIDsContainer.getMetabolitesIDs());
// if(byName.containsKey("O2"))
// System.out.println(byName.get("O2"));
//
// if(byName.containsKey("oxygen"))
// System.out.println(byName.get("oxygen"));
//
// if(byName.containsKey("Oxygen"))
// System.out.println(byName.get("Oxygen"));
//
// if(byName.containsKey("OXYGEN-MOLECULE"))
// System.out.println(byName.get("OXYGEN-MOLECULE"));
// if(namesAndIDsContainer.getMetabolitesIDs().containsKey("triose phosphate"))
// System.out.println(namesAndIDsContainer.getMetabolitesIDs().get("triose phosphate"));
// else
// System.out.println("nada");
// System.out.println(byName.containsKey("L-val"));
......@@ -208,16 +212,17 @@ public class WriteByMetabolitesID {
// MetabolitesChilds.getMetaboliteChilds("ECOLI:Sugar-alcohols", MetaboliteMajorLabel.MetaCyc, service);
// if(byName.containsKey("Oxygen"))
// System.out.println(byName.get("oxygen"));
//
//
// Node node = service.getNodeByEntryAndLabel("C00007", MetaboliteMajorLabel.LigandCompound);
// Node node = service.getNodeByEntryAndLabel("ECOLI:Sugar-alcohols", MetaboliteMajorLabel.MetaCyc);
// Node node = service.getNodeByEntryAndLabel("C19171", MetaboliteMajorLabel.LigandCompound);
// System.out.println(node.getAllProperties());
// Node node2 = service.getNodeByEntryAndLabel("META:CU+2", MetaboliteMajorLabel.MetaCyc);
// System.out.println(node2.getAllProperties());
//
// String names = (String) node.getAllProperties().get("name");
// String names = (String) node.getAllProperties().get("name");
......@@ -226,11 +231,11 @@ public class WriteByMetabolitesID {
// System.out.println(names);
//
//
// Map<String, Object> properties = node.getAllProperties();
// Map<String, Object> properties2 = node2.getAllProperties();
//
// System.out.println(properties);
// System.out.println(properties2);
// Map<String, Object> properties = node.getAllProperties();
// Map<String, Object> properties2 = node2.getAllProperties();
//
// System.out.println(properties);
// System.out.println(properties2);
//
// System.out.println();
//
......@@ -490,11 +495,11 @@ public class WriteByMetabolitesID {
if(useCache) {
namesLowerCaseWithoutSigns = FileUtils.readMapFromFile("C:\\Users\\Davide\\Documents\\InternalDB\\info\\getNamesLowerCaseWithoutSigns.txt");
namesWithoutSigns = FileUtils.readMapFromFile("C:\\Users\\Davide\\Documents\\InternalDB\\info\\getNamesWithoutSigns.txt");
namesLowerCase = FileUtils.readMapFromFile("C:\\Users\\Davide\\Documents\\InternalDB\\info\\getNamesLowerCase.txt");
namesLowerCaseWithoutSigns = FileUtils.readMapFromFile("C:\\Users\\Davide\\Documents\\InternalDB\\info\\getNamesLowerCaseWithoutSigns2.txt");
namesWithoutSigns = FileUtils.readMapFromFile("C:\\Users\\Davide\\Documents\\InternalDB\\info\\getNamesWithoutSigns2.txt");
namesLowerCase = FileUtils.readMapFromFile("C:\\Users\\Davide\\Documents\\InternalDB\\info\\getNamesLowerCase2.txt");
compounds = FileUtils.readMapFromFile2("C:\\Users\\Davide\\Documents\\InternalDB\\info\\getMetabolitesIDs.txt");
compounds = FileUtils.readMapFromFile2("C:\\Users\\Davide\\Documents\\InternalDB\\info\\getMetabolitesIDs2.txt");
return new BiosynthMetabolites(compounds, namesLowerCaseWithoutSigns, namesWithoutSigns, namesLowerCase);
}
......@@ -506,18 +511,18 @@ public class WriteByMetabolitesID {
Set<BiodbMetaboliteNode> allMetabolites = service.listMetabolites();
// Set<BiodbMetaboliteNode> allMetabolites = new HashSet<>();
//
// allMetabolites.add(service.getMetabolite("C00007", MetaboliteMajorLabel.LigandCompound));
// allMetabolites.add(service.getMetabolite("o2", MetaboliteMajorLabel.BiGG));
// allMetabolites.add(service.getMetabolite("ECOLI:OXYGEN-MOLECULE", MetaboliteMajorLabel.MetaCyc));
// allMetabolites.add(service.getMetabolite("cpd00007", MetaboliteMajorLabel.ModelSeed));
// allMetabolites.add(service.getMetabolite("C05359", MetaboliteMajorLabel.LigandCompound));
// allMetabolites.add(service.getMetabolite("glu-L", MetaboliteMajorLabel.BiGG));
// allMetabolites.add(service.getMetabolite("META:CU+2", MetaboliteMajorLabel.MetaCyc));
// allMetabolites.add(service.getMetabolite("cpd12713", MetaboliteMajorLabel.ModelSeed));
// allMetabolites.add(service.getMetabolite("glu__L", MetaboliteMajorLabel.BiGGMetabolite));
// Set<String> allDatabases = new HashSet<>();
for(BiodbMetaboliteNode node : allMetabolites) {
String formula = "";
String entryID = node.getEntry();
if(!entryID.isEmpty()) {
......@@ -528,8 +533,8 @@ public class WriteByMetabolitesID {
Set<String> names = getSynonyms(node, nodeProperties, service);
// if(entryID.equals("ECOLI:VAL"))
// System.out.println("found ecoli >>>> " + names);
// if(entryID.equals("ECOLI:VAL"))
// System.out.println("found ecoli >>>> " + names);
if(!names.isEmpty()) {
......@@ -540,15 +545,25 @@ public class WriteByMetabolitesID {
// if(entryID.equals("META:OXYGEN-MOLECULE"))
// System.out.println(formula + "\t" + names);
names.remove("co2"); // to avoid errors with cobalt ions
for(String name : names) {
// System.out.println();
//
// System.out.println(name + " >>>>> " + formula);
for(String metab : formulas.keySet()) {
if(metab.equalsIgnoreCase(name)) {
// System.out.println("EQUALS NAME >>>>> " + metab + "\t" + name );
if(formula.equals(formulas.get(metab))) {
// System.out.println("EQUALS formula >>>>> " + formula + "\t" + formulas.get(metab) );
if(!formulas.containsKey(name))
formulas.put(name, formula);
......@@ -558,6 +573,9 @@ public class WriteByMetabolitesID {
newAlias.addAll(names);
// System.out.println(alias);
// System.out.println(newAlias);
for(String newEntry : newAlias)
alias.put(newEntry, newAlias);
......@@ -570,6 +588,11 @@ public class WriteByMetabolitesID {
for(String name : names) {
// if(name.equals("Cu2+"))
// System.out.println("coiso");
// System.out.println("name>>" + name);
if(!found) {
......@@ -584,6 +607,8 @@ public class WriteByMetabolitesID {
Map<MetaboliteMajorLabel, String> ids = new HashMap<>();
// System.out.println("alias>>> " + alias.get(name));
for(String name2 : alias.get(name)) {
if(compounds.containsKey(name2))
ids.putAll(compounds.get(name2));
......@@ -592,24 +617,35 @@ public class WriteByMetabolitesID {
ids.put(label, entryID);
for(String name2 : alias.get(name)) {
compounds.put(name2, ids);
if(compounds.containsKey(name2)) {
for(MetaboliteMajorLabel key : ids.keySet()) {
if(!compounds.get(name2).containsKey(key))
compounds.get(name2).put(key, ids.get(key));
}
}
else
compounds.put(name2, ids);
namesWithoutSigns.put(name2, name2.replaceAll("[^A-Za-z0-9]", ""));
namesLowerCase.put(name2, name2.toLowerCase());
namesLowerCaseWithoutSigns.put(name2, name2.replaceAll("[^A-Za-z0-9]", "").toLowerCase());
}
}
}
}
}
// System.out.println(compounds.get("Electron"));
// FindMetabolitesID.saveMapInFile2("C:\\Users\\Davide\\Documents\\InternalDB\\DatabaseCompounds.txt", compounds);
// FindMetabolitesID.saveMapInFile2("C:\\Users\\Davide\\Documents\\InternalDB\\filtered.txt", filteredCompounds);
System.out.println("Number of compounds: " + compounds.size());
// System.out.println("Number of compounds: " + compounds.size());
// System.out.println(compounds);
// System.out.println("Number of filteredCompounds: " + filteredCompounds.size());
......@@ -662,6 +698,11 @@ public class WriteByMetabolitesID {
* @return
*/
private static Set<String> getSynonyms(Node node, Map<String, Object> nodeProperties, BiodbGraphDatabaseService service){
// if(node.getAllProperties().get("entry").equals("META:CU+2"))
// System.out.println("AQUIIII");
// else
// System.out.println("not found!!!!!!!!");
String names ="";
Set<String> synonyms = new HashSet<>();
......@@ -701,6 +742,7 @@ public class WriteByMetabolitesID {
}
for(String name : names.split(";")) {
if(!name.isEmpty()) {
synonyms.add(name.replaceAll("<i>", "").replaceAll("</i>", "").replaceAll("\\[", "")
.replaceAll("\\]", "").replaceAll("</sup>", "").replaceAll("<sup>", "").replaceAll("</I>", "").replaceAll("<I>", "")
......
......@@ -14,37 +14,45 @@ import pt.uminho.sysbio.biosynth.integration.io.dao.neo4j.MetaboliteMajorLabel;
import pt.uminho.sysbio.biosynthframework.BiodbGraphDatabaseService;
public class IdentifyReactionsMetabolites {
/**
 * Regular expressions matching a leading stoichiometry prefix of a name:
 * first a leading "n" surrounded by optional whitespace, then leading digits
 * optionally followed by slashes, more digits and whitespace (e.g. "2 " or "1/2 ").
 * NOTE(review): presumably stripped from metabolite names by the
 * stoichiometry-deleting identification step - confirm against its helper.
 */
public static final String[] REGEX_STOICHIOMETRY = {"^(\\s*n\\s*)", "^(\\d+\\/*\\d*\\s*)"};
/**
 * Regular expression matching "ic acid" followed by zero or more "s".
 * NOTE(review): looks like it is paired by index with {@link #REPLACEMENT_NAME} - confirm.
 */
public static final String[] REGEX_NAMES = {"ic acids*"};
/**
 * Replacement text "ate".
 * NOTE(review): presumably substituted for matches of {@link #REGEX_NAMES} - confirm.
 */
public static final String[] REPLACEMENT_NAME = {"ate"};
/** The letter "s"; the identification methods compare the last character of a metabolite name against it. */
public static final String s = "s";
// Container passed to the constructor; the identification methods read its name maps
// (getNamesWithoutSigns(), getNamesLowerCase()).
private BiosynthMetabolites namesAndIDsContainer;
// Copy of namesAndIDsContainer.getMetabolitesIDs() taken in the constructor; queried by name for database ids.
private Map<String, Map<MetaboliteMajorLabel, String>> allMetabolitesByName;
// Metabolite properties handed to the constructor and stored as received.
private Map<String, BiosynthMetaboliteProperties> data;
// Graph database service handed to the constructor and stored as received.
private BiodbGraphDatabaseService service;
// Identification results; its size is printed after each identification step.
// NOTE(review): presumably keyed by metabolite name - confirm where it is initialised and filled.
private Map<String, Map<String, MetaboliteMajorLabel>> tcdbMetabolitesIDs;
// Metabolite names iterated over by the identification methods.
// NOTE(review): presumably the names still waiting to be identified - confirm where it is initialised.
private Set<String> metabolites;
// Synonyms helper instantiated in the constructor (package-private field).
Synonyms dictionary;
/**
 * Stores the supplied collaborators, then runs the identification of the
 * metabolites found in the given reactions, printing progress counts to
 * standard output.
 *
 * @param reactionsData reactions data passed to getMetabolitesFromReactions to obtain the metabolite names to search for
 * @param data metabolite properties, stored as received
 * @param namesAndIDsContainer container whose getMetabolitesIDs() map is copied into allMetabolitesByName; also stored as received
 * @param service graph database service, stored and forwarded to getMetabolitesIDs
 */
public IdentifyReactionsMetabolites(Map<String, Set<TcNumberContainer>> reactionsData, Map<String, BiosynthMetaboliteProperties> data, BiosynthMetabolites namesAndIDsContainer, BiodbGraphDatabaseService service) {
this.service = service;
this.data = data;
// Copy the name -> ids map into a new HashMap rather than keeping the container's own map instance.
this.allMetabolitesByName = new HashMap<>(namesAndIDsContainer.getMetabolitesIDs());
this.namesAndIDsContainer = namesAndIDsContainer;
this.dictionary = new Synonyms();
// Collect the metabolite names that appear in the reactions.
Set<String> tcdbMetabolites = getMetabolitesFromReactions(reactionsData);
System.out.println("TOTAL FOR SEARCH: " + tcdbMetabolites.size());
// The return value is ignored; the result is read from the tcdbMetabolitesIDs field below.
// NOTE(review): assumes getMetabolitesIDs initialises tcdbMetabolitesIDs, otherwise the next line throws a NullPointerException - confirm.
getMetabolitesIDs(tcdbMetabolites, namesAndIDsContainer, service);
System.out.println("TOTAL FOUND: " + tcdbMetabolitesIDs.size()); //1103
// for(String metabolite : metabolites)
// System.out.println(metabolite);
//
// System.out.println(metabolites.size());
}
/**
* get ids for metabolites that are present in tcdb's reactions
*
......@@ -62,57 +70,56 @@ public class IdentifyReactionsMetabolites {
namesAndIDsContainer = standardizationOfNames2(namesAndIDsContainer);
System.out.println("metabolites >>>" + metabolites.size());
// System.out.println("MET>>>>> " + allMetabolitesByName.get("Electron"));
System.out.println("metabolites >>>" + metabolites.size()); //2078
System.out.println("allmetabolites >>>" + namesAndIDsContainer.getMetabolitesIDs().size()); //154224
System.out.println("allmetabolites >>>" + namesAndIDsContainer.getMetabolitesIDs().size());
identificationByDirectMatch();
System.out.println("FOUND1: " + tcdbMetabolitesIDs.size());
System.out.println("FOUND1: " + tcdbMetabolitesIDs.size()); //911
identificationDeletingStoichiometry();
System.out.println("FOUND2: " + tcdbMetabolitesIDs.size());
System.out.println("FOUND2: " + tcdbMetabolitesIDs.size()); //975
identificationInLowerCase();
System.out.println("FOUND3: " + tcdbMetabolitesIDs.size());
System.out.println("FOUND3: " + tcdbMetabolitesIDs.size()); //1089
identificationReplacingNonAlphanumeric();
System.out.println("FOUND4: " + tcdbMetabolitesIDs.size());
System.out.println("FOUND4: " + tcdbMetabolitesIDs.size()); //1095
identificationIntroducingDandL();
System.out.println("FOUND5: " + tcdbMetabolitesIDs.size());
System.out.println("FOUND5: " + tcdbMetabolitesIDs.size()); //1103
identificationReplacingNonAlphanumericAndInLowercase();
System.out.println("FOUND6: " + tcdbMetabolitesIDs.size());
System.out.println("FOUND6: " + tcdbMetabolitesIDs.size()); //1109
return tcdbMetabolitesIDs;
}
private void saveMetabolite(String metabolite, Map<MetaboliteMajorLabel, String> ids) {
MetaboliteMajorLabel id = selectMetaboliteMajorLabel(ids);
MetaboliteMajorLabel id = selectMetaboliteMajorLabel(metabolite, ids);
if(id != null) {
Map<String, MetaboliteMajorLabel> map = new HashMap<>();
......@@ -124,43 +131,55 @@ public class IdentifyReactionsMetabolites {
metabolites.remove(metabolite);
}
}
/**
* Identification of metabolites introducing D- and L- at the beginning.
*/
private void identificationIntroducingDandL() {
for(String metabolite : new HashSet<>(metabolites)) {
if(namesAndIDsContainer.getNamesWithoutSigns().containsKey("D-"+metabolite)) {
if(metabolite.matches("^(D*L*-+).+")){
Map<MetaboliteMajorLabel, String> ids = allMetabolitesByName.get("D-"+metabolite);
if(namesAndIDsContainer.getNamesWithoutSigns().containsKey(metabolite.replaceAll("^(D*L*-+)", ""))) {
saveMetabolite(metabolite, ids);
Map<MetaboliteMajorLabel, String> ids = allMetabolitesByName.get(metabolite.replaceAll("^(D*L*-+)", ""));
saveMetabolite(metabolite, ids);
}
}
if(namesAndIDsContainer.getNamesWithoutSigns().containsKey("L-"+metabolite)) {
else {
if(namesAndIDsContainer.getNamesWithoutSigns().containsKey("D-"+metabolite)) {
Map<MetaboliteMajorLabel, String> ids = allMetabolitesByName.get("L-"+metabolite);
Map<MetaboliteMajorLabel, String> ids = allMetabolitesByName.get("D-"+metabolite);
saveMetabolite(metabolite, ids);
saveMetabolite(metabolite, ids);
}
if(namesAndIDsContainer.getNamesWithoutSigns().containsKey("L-"+metabolite)) {
Map<MetaboliteMajorLabel, String> ids = allMetabolitesByName.get("L-"+metabolite);
saveMetabolite(metabolite, ids);
}
}
}
}
/**
* Identification of metabolites replacing non-alphanumeric characters and in lowercase.
*/
private void identificationReplacingNonAlphanumericAndInLowercase() {
for(String metabolite : new HashSet<>(metabolites)) {
if(namesAndIDsContainer.getNamesWithoutSigns().containsKey(metabolite.replaceAll("[^A-Za-z0-9]", "").toLowerCase())) {
Map<MetaboliteMajorLabel, String> ids = allMetabolitesByName.get(metabolite.replaceAll("[^A-Za-z0-9]", "").toLowerCase());
// System.out.println(metabolite + ids);
saveMetabolite(metabolite, ids);
}
else if(String.valueOf(metabolite.charAt(metabolite.length()-1)).equals(s)) {
......@@ -168,23 +187,28 @@ public class IdentifyReactionsMetabolites {
Map<MetaboliteMajorLabel, String> ids = allMetabolitesByName.get(metabolite.substring(0, metabolite.length()-1).replaceAll("[^A-Za-z0-9]", ""));
// System.out.println(metabolite + ids);
saveMetabolite(metabolite, ids);
}
}
}
}
/**
* Identification of metabolites replacing non-alphanumeric characters.
*/
private void identificationReplacingNonAlphanumeric() {
for(String metabolite : new HashSet<>(metabolites)) {
if(namesAndIDsContainer.getNamesWithoutSigns().containsKey(metabolite.replaceAll("[^A-Za-z0-9]", ""))) {
Map<MetaboliteMajorLabel, String> ids = allMetabolitesByName.get(metabolite.replaceAll("[^A-Za-z0-9]", ""));
// System.out.println(metabolite + ids);
saveMetabolite(metabolite, ids);
}
else if(String.valueOf(metabolite.charAt(metabolite.length()-1)).equals(s)) {
......@@ -192,23 +216,27 @@ public class IdentifyReactionsMetabolites {
Map<MetaboliteMajorLabel, String> ids = allMetabolitesByName.get(metabolite.substring(0, metabolite.length()-1).replaceAll("[^A-Za-z0-9]", ""));
// System.out.println(metabolite + ids);
saveMetabolite(metabolite, ids);
}
}
}
}
/**
* Identification of metabolites using match in lower case.
*/
private void identificationInLowerCase() {
for(String metabolite : new HashSet<>(metabolites)) {
if(namesAndIDsContainer.getNamesLowerCase().containsKey(metabolite.toLowerCase())) {
Map<MetaboliteMajorLabel, String> ids = allMetabolitesByName.get(metabolite.toLowerCase());
// System.out.println(metabolite + ids);
saveMetabolite(metabolite, ids);
}
else if(String.valueOf(metabolite.charAt(metabolite.length()-1)).equals(s)) {
......@@ -217,40 +245,45 @@ public class IdentifyReactionsMetabolites {
Map<MetaboliteMajorLabel, String> ids = allMetabolitesByName.get(metabolite.substring(0, metabolite.length()-1).toLowerCase());
// System.out.println(metabolite + ids);
saveMetabolite(metabolite, ids);
}
}
}
}
/**
* Method to identify the metabolite by deleting the stoichiometry value (if present), and by deleting a last letter 's' if present too.
*/
private void identificationDeletingStoichiometry() {
for(String metabolite : new HashSet<>(metabolites)) {
identificationDeletingStoichiometryAux(metabolite, false);
if(metabolites.contains(metabolite)) {