GitLab CI for Docker builds is enabled! You can turn it on by adding a .gitlab-ci.yml file to your project. See the file template at https://gitlab.bio.di.uminho.pt/snippets/5

Commit 990e6ca8 authored by Davide Lagoa's avatar Davide Lagoa
Browse files

refactoring names retriever from biosynth database + validation

parent c82612ac
......@@ -16,8 +16,16 @@
<!-- <url>http://192.168.1.99/nexus/content/groups/public/</url> -->
<url>http://193.137.11.210/nexus/content/groups/public/</url>
</repository>
<repository>
<id>ebi-repo</id>
<name>ebi-repo</name>
<url>http://www.ebi.ac.uk/intact/maven/nexus/content/repositories/ebi-repo/</url>
</repository>
</repositories>
<build>
<plugins>
<!-- <plugin> <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-compiler-plugin</artifactId>
......@@ -59,6 +67,16 @@
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-slf4j-impl</artifactId>
</exclusion>
<exclusion>
<groupId>
uk.ac.ebi.chebi.webapps.chebiWS.client
</groupId>
<artifactId>chebiWS-client</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.axis</groupId>
<artifactId>axis-saaj</artifactId>
</exclusion>
</exclusions>
</dependency>
......@@ -253,5 +271,12 @@
<version>1.2.3</version>
</dependency>
<!-- https://mvnrepository.com/artifact/uk.ac.ebi.chebi.webapps.chebiWS.client/chebiWS-client -->
<dependency>
<groupId>uk.ac.ebi.chebi.webapps.chebiWS.client</groupId>
<artifactId>chebiWS-client</artifactId>
<version>2.4</version>
</dependency>
</dependencies>
</project>
package APIs;
import java.util.List;
import pt.uminho.ceb.biosystems.merlin.bioapis.externalAPI.ebi.chebi.ChebiAPIInterface;
import uk.ac.ebi.chebi.webapps.chebiWS.client.ChebiWebServiceClient;
import uk.ac.ebi.chebi.webapps.chebiWS.model.DataItem;
import uk.ac.ebi.chebi.webapps.chebiWS.model.Entity;
import uk.ac.ebi.chebi.webapps.chebiWS.model.LiteEntity;
import uk.ac.ebi.chebi.webapps.chebiWS.model.LiteEntityList;
import uk.ac.ebi.chebi.webapps.chebiWS.model.SearchCategory;
import uk.ac.ebi.chebi.webapps.chebiWS.model.StarsCategory;
/**
 * Static helper that queries the ChEBI web service to translate an external
 * identifier into a MetaCyc accession.
 */
public class ChebiAPI extends ChebiAPIInterface{

    /**
     * Web-service client shared by all lookups of this class.
     * NOTE(review): assumes a ChebiWebServiceClient may be reused across calls
     * (and threads, if this helper is ever called concurrently) - confirm.
     */
    static private ChebiWebServiceClient chebiClient = new ChebiWebServiceClient();

    /**
     * Searches ChEBI for an entity cross-referenced by {@code id} and returns the
     * MetaCyc accession listed among that entity's database links.
     *
     * @param id external identifier to search for in the manual cross-references
     * @return the MetaCyc accession, or {@code null} when {@code id} is null/empty,
     *         no entity or MetaCyc link is found, or the web-service call fails
     */
    public static String getMetacycIDUsingExternalReference(String id) {

        if(id == null || id.isEmpty())
            return null;

        String identifier = null;

        try {
            // Reuse the shared client instead of building a new one on every call.
            LiteEntityList entities = chebiClient.getLiteEntity(id, SearchCategory.MANUAL_XREFS, 1, StarsCategory.THREE_ONLY);

            String chebiID = "";

            if(entities != null) {
                List<LiteEntity> resultList = entities.getListElement();

                // Keeps the ChEBI id of the last hit returned by the search.
                for (LiteEntity liteEntity : resultList) {
                    chebiID = liteEntity.getChebiId();
                }
            }

            if(chebiID != null && !chebiID.isEmpty()) {

                Entity entity = chebiClient.getCompleteEntity(chebiID);
                List<DataItem> databaseLinks = entity.getDatabaseLinks();

                // Scan the database links for the first MetaCyc accession.
                for (DataItem dataItem : databaseLinks) {

                    String type = dataItem.getType();

                    if(type != null && type.trim().equalsIgnoreCase("MetaCyc accession")) {
                        identifier = dataItem.getData();
                        break;
                    }
                }
            }
        }
        catch (Exception e1) {
            // Best-effort lookup: report the failure and fall through to return null.
            e1.printStackTrace();
        }

        return identifier;
    }
}
......@@ -19,6 +19,7 @@ import org.slf4j.LoggerFactory;
import org.springframework.core.io.FileSystemResource;
import files.FilesUtils;
import internalDB.FetchCompoundsByName;
import internalDB.WriteByMetabolitesID;
import pt.uminho.sysbio.biosynth.integration.GraphMetaboliteEntity;
import pt.uminho.sysbio.biosynth.integration.etl.CentralMetaboliteEtlDataCleansing;
......@@ -164,18 +165,29 @@ public class ModelSEED {
while ((line = reader.readLine()) != null) {
if(!line.isEmpty()) {
if(!line.isEmpty() && !line.startsWith("MS ID")) {
Map<String, Set<String>> submap = new HashMap<>();
String[] text = line.split("\t");
String id = text[0].trim();
String oldID;
String externalID = text[2];
String source = text[3].trim();
if(text.length == 3) {
if(text.length == 3)
externalID = text[1].trim();
else if (!text[1].trim().isEmpty()) {
oldID = text[1].trim();
Integer currentID = FetchCompoundsByName.getIDNumberFormat(id, MetaboliteMajorLabel.ModelSeed);
Integer previousID = FetchCompoundsByName.getIDNumberFormat(oldID, MetaboliteMajorLabel.ModelSeed);
if(previousID < currentID) {
id = oldID;
}
}
MetaboliteMajorLabel label = null;
......@@ -230,6 +242,8 @@ public class ModelSEED {
e.printStackTrace();
}
System.out.println(data);
return data;
}
......
package internalDB;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.neo4j.graphdb.Node;
import org.neo4j.graphdb.Relationship;
import org.neo4j.graphdb.RelationshipType;
import org.neo4j.graphdb.ResourceIterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.collect.Iterators;
import APIs.ChebiAPI;
import containers.BiosynthMetabolites;
import files.FilesUtils;
import pt.uminho.ceb.biosystems.merlin.bioapis.externalAPI.ebi.chebi.ChebiAPIInterface;
import pt.uminho.sysbio.biosynth.integration.io.dao.neo4j.MetaboliteMajorLabel;
import pt.uminho.sysbio.biosynth.integration.io.dao.neo4j.MetabolitePropertyLabel;
import pt.uminho.sysbio.biosynth.integration.neo4j.BiodbMetaboliteNode;
import pt.uminho.sysbio.biosynthframework.BiodbGraphDatabaseService;
import uk.ac.ebi.chebi.webapps.chebiWS.client.ChebiWebServiceClient;
import uk.ac.ebi.chebi.webapps.chebiWS.model.DataItem;
import uk.ac.ebi.chebi.webapps.chebiWS.model.Entity;
import uk.ac.ebi.chebi.webapps.chebiWS.model.LiteEntityList;
import uk.ac.ebi.chebi.webapps.chebiWS.model.SearchCategory;
import uk.ac.ebi.chebi.webapps.chebiWS.model.StarsCategory;
import utilities.FileUtils;
import utilities.triage_utilities.Utilities;
public class FetchCompoundsByName {
// Suffix appended to the names of the cache files read/written by the constructor.
public static final String FILE_VERSION = "14.txt";
// Database labels for which saveNameAux is allowed to store a cross-reference.
private static final Set<MetaboliteMajorLabel> DEFAULT_DATABASES = Set.of(MetaboliteMajorLabel.LigandCompound, MetaboliteMajorLabel.LigandGlycan,
MetaboliteMajorLabel.ModelSeed, MetaboliteMajorLabel.MetaCyc, MetaboliteMajorLabel.EcoCyc, MetaboliteMajorLabel.BiGG, MetaboliteMajorLabel.BiGGMetabolite);
// Processed metabolite name (Utilities.processBiosynthName) -> database label -> entry identifier.
private Map<String, Map<MetaboliteMajorLabel, String>> compounds;
// Key is the label's toString() concatenated with the entry identifier; value is that entry's formula.
private Map<String, String> formulas;
// ModelSeed entry -> ModelSeed entry with the same formula and a lower numeric id that should replace it.
private Map<String, String> forReplacement;
// Name -> name stripped of every character outside [A-Za-z0-9], lower-cased.
private Map<String, String> namesLowerCaseWithoutSigns; //for later comparison
// Name -> name stripped of every character outside [A-Za-z0-9].
private Map<String, String> namesWithoutSigns; //for later comparison
// Name -> lower-cased name.
private Map<String, String> namesLowerCase; //for later comparison
// Graph database queried for name and metabolite nodes.
private BiodbGraphDatabaseService service;
private static final Logger logger = LoggerFactory.getLogger(FetchCompoundsByName.class);
/**
* Method to retrieve biosynts' metabolites by name from all major databases
*
* @param service
* @return
*/
public FetchCompoundsByName(BiodbGraphDatabaseService service, boolean useCache) {
compounds = new HashMap<>();
forReplacement = new HashMap<>();
formulas = new HashMap<>();
namesLowerCaseWithoutSigns = new HashMap<>(); //for later comparison
namesWithoutSigns = new HashMap<>(); //for later comparison
namesLowerCase = new HashMap<>(); //for later comparison
this.service = service;
if(useCache) {
namesLowerCaseWithoutSigns = FilesUtils.readMapFromFile("C:\\Users\\Davide\\Documents\\InternalDB\\info\\getNamesLowerCaseWithoutSigns" + FILE_VERSION);
namesWithoutSigns = FilesUtils.readMapFromFile("C:\\Users\\Davide\\Documents\\InternalDB\\info\\getNamesWithoutSigns" + FILE_VERSION);
namesLowerCase = FilesUtils.readMapFromFile("C:\\Users\\Davide\\Documents\\InternalDB\\info\\getNamesLowerCase" + FILE_VERSION);
compounds = FileUtils.readMapFromFile2("C:\\Users\\Davide\\Documents\\InternalDB\\info\\getMetabolitesIDs" + FILE_VERSION);
}
else {
getNamesMethod1();
getNamesMethod2();
// forReplacement.put("cpd27042", "cpd00672");
// forReplacement.put("cpd20862", "cpd00244");
for(String s : compounds.keySet()) {
String id = compounds.get(s).get(MetaboliteMajorLabel.ModelSeed);
if(forReplacement.containsKey(id)) {
compounds.get(s).put(MetaboliteMajorLabel.ModelSeed, forReplacement.get(id));
}
}
compounds.remove("R");
compounds.remove("r");
compounds.remove("P");
compounds.remove("p");
FilesUtils.saveMapInFile("C:\\Users\\Davide\\Documents\\InternalDB\\info\\forReplacement.txt", forReplacement);
FileUtils.saveMapInFile2("C:\\Users\\Davide\\Documents\\InternalDB\\info\\getMetabolitesIDs" + FILE_VERSION, compounds);
FilesUtils.saveMapInFile("C:\\Users\\Davide\\Documents\\InternalDB\\info\\getNamesLowerCase" + FILE_VERSION, namesLowerCase);
FilesUtils.saveMapInFile("C:\\Users\\Davide\\Documents\\InternalDB\\info\\getNamesLowerCaseWithoutSigns" + FILE_VERSION, namesLowerCaseWithoutSigns);
FilesUtils.saveMapInFile("C:\\Users\\Davide\\Documents\\InternalDB\\info\\getNamesWithoutSigns" + FILE_VERSION, namesWithoutSigns);
}
}
/**
 * Search method 1: walks every node carrying the name label and, for each name,
 * groups the ids of the metabolite nodes linked to it by database label before
 * handing them to {@link #saveNameAux(String, Map)}.
 * <p>
 * Only name nodes with a "key" property are processed, and only linked nodes
 * that expose both "major_label" and "entry" are counted.
 */
private void getNamesMethod1() {

    logger.trace("Searching using method 1");

    // First pass only counts the name nodes so progress can be reported as a percentage.
    int dataSize;
    try (ResourceIterator<Node> counter = service.findNodes(MetabolitePropertyLabel.Name)) {
        dataSize = Iterators.size(counter);
    }

    int lastProgress = -1;
    int current = 0;

    // try-with-resources releases the iterator even when processing a node fails.
    try (ResourceIterator<Node> nodes = service.findNodes(MetabolitePropertyLabel.Name)) {

        while(nodes.hasNext()) {

            Node node = nodes.next();

            if(node.hasProperty("key")) {

                String name = node.getProperty("key").toString();
                Map<MetaboliteMajorLabel, Set<Long>> counts = new HashMap<>();

                for(Relationship rel : node.getRelationships()) {

                    Node metaboliteNode = rel.getStartNode();

                    if(metaboliteNode.hasProperty("major_label") && metaboliteNode.hasProperty("entry")) {

                        MetaboliteMajorLabel label = getMetaboliteLabel(metaboliteNode);

                        counts.computeIfAbsent(label, key -> new HashSet<>()).add(metaboliteNode.getId());
                    }
                }

                saveNameAux(name, counts);
            }

            current++;

            // long arithmetic so that current * 100 cannot overflow on very large databases
            int progress = (int) ((current * 100L) / dataSize);

            if(progress > lastProgress){
                lastProgress = progress;
                logger.trace("{} % search complete", progress);
            }
        }
    }
}
/**
 * Search method 2: goes through every metabolite listed by the service, collects
 * its synonyms and, for each synonym not yet present in {@code compounds}, groups
 * the ids of the metabolite nodes by database label. Each collected name is then
 * passed to {@link #saveNameAux(String, Map)}.
 */
private void getNamesMethod2() {

    Map<String, Map<MetaboliteMajorLabel, Set<Long>>> nodesByName = new HashMap<>();

    Set<BiodbMetaboliteNode> metabolites = service.listMetabolites();

    logger.trace("Searching using method 2");

    for(BiodbMetaboliteNode metabolite : metabolites) {

        // metabolites without an entry identifier are ignored
        if(metabolite.getEntry().isEmpty())
            continue;

        Map<String, Object> properties = metabolite.getAllProperties();
        Set<String> synonyms = getSynonyms(metabolite, properties, service);

        for(String synonym : synonyms) {

            // names already resolved by a previous search are left untouched
            if(compounds.containsKey(synonym))
                continue;

            Map<MetaboliteMajorLabel, Set<Long>> idsByLabel = nodesByName.computeIfAbsent(synonym, key -> new HashMap<>());

            if(metabolite.hasProperty("major_label")) {

                MetaboliteMajorLabel label = getMetaboliteLabel(metabolite);

                idsByLabel.computeIfAbsent(label, key -> new HashSet<>()).add(metabolite.getId());
            }
        }
    }

    int total = nodesByName.size();
    int processed = 0;
    int lastReported = -1;

    for(Map.Entry<String, Map<MetaboliteMajorLabel, Set<Long>>> entry : nodesByName.entrySet()) {

        saveNameAux(entry.getKey(), entry.getValue());

        processed++;

        int percentage = (processed*100)/total;

        if(percentage > lastReported){
            lastReported = percentage;
            logger.trace(String.valueOf(percentage).concat(" % search complete"));
        }
    }
}
/**
 * Resolves the database references of one metabolite name and stores them,
 * together with the normalised variants of the name, in the instance maps.
 * <p>
 * The node chosen by {@code selectBestNode} is used as the starting point: every
 * node reachable through a "has_crossreference_to" relationship that has an
 * "entry" property is a candidate reference. When no node can be chosen a
 * warning is logged and the name is stored without references.
 *
 * @param name metabolite name (alias) being processed
 * @param counts ids of the metabolite nodes associated with the name, grouped by database label
 */
private void saveNameAux(String name, Map<MetaboliteMajorLabel, Set<Long>> counts) {

    Map<MetaboliteMajorLabel, String> references = new HashMap<>();

    Long id = selectBestNode(counts);

    Node metaboliteNode = null;

    if(id != null) {

        metaboliteNode = service.getNodeById(id);

        String formula = "";

        if(metaboliteNode.hasProperty("formula"))
            formula = metaboliteNode.getProperty("formula").toString();

        Iterable<Relationship> relationships = metaboliteNode.getRelationships(RelationshipType.withName("has_crossreference_to"));

        for(Relationship rel : relationships) {

            // A HashSet is used instead of Set.of(...): Set.of throws
            // IllegalArgumentException when both ends of the relationship are the same node.
            Set<Node> relatedNodes = new HashSet<>();
            relatedNodes.add(rel.getStartNode());
            relatedNodes.add(rel.getEndNode());

            for(Node relatedNode : relatedNodes) {

                if(!relatedNode.hasProperty("entry"))
                    continue;

                String entryID = relatedNode.getProperty("entry").toString();

                MetaboliteMajorLabel label = getMetaboliteLabel(relatedNode);

                String formula2 = "";

                if(relatedNode.hasProperty("formula"))
                    formula2 = relatedNode.getProperty("formula").toString();

                // Among the other ModelSeed nodes sharing this name, register a replacement
                // whenever one has the same formula and a lower numeric identifier.
                if(label.equals(MetaboliteMajorLabel.ModelSeed) && counts.containsKey(label)) {

                    for(Long previousID : counts.get(label)) {

                        if(id.equals(previousID))
                            continue;

                        Node previousNode = service.getNodeById(previousID);

                        if(!previousNode.hasProperty("formula"))
                            continue;

                        String previousFormula = previousNode.getProperty("formula").toString();

                        if(formula2.equals(previousFormula)) {

                            String previousEntry = previousNode.getProperty("entry").toString();

                            Integer currentEntryID = FetchCompoundsByName.getIDNumberFormat(entryID, label);
                            Integer previousEntryID = FetchCompoundsByName.getIDNumberFormat(previousEntry, label);

                            if(currentEntryID > previousEntryID)
                                forReplacement.put(entryID, previousEntry);
                        }
                    }
                }

                if(DEFAULT_DATABASES.contains(label) && saveEntry(entryID, name, formula, relatedNode, label, references, formulas)) {

                    references.put(label, entryID);
                    formulas.put(label.toString().concat(entryID), formula2);
                }
            }
        }
    }
    else
        logger.warn("Conflicts while searching metabolites for metabolite alias: {}. Consider creating an exception!", name);

    // Fall back to a dedicated lookup when the cross-references gave no MetaCyc entry.
    if(!references.containsKey(MetaboliteMajorLabel.MetaCyc) && !name.isEmpty() && metaboliteNode != null) {

        String metacycEntry = getMetacycEntry(metaboliteNode, counts);

        if(metacycEntry != null)
            references.put(MetaboliteMajorLabel.MetaCyc, metacycEntry);
    }

    String nameWithoutSigns = name.replaceAll("[^A-Za-z0-9]", "");

    compounds.put(Utilities.processBiosynthName(name), references);
    namesWithoutSigns.put(name, nameWithoutSigns);
    namesLowerCase.put(name, name.toLowerCase());
    namesLowerCaseWithoutSigns.put(name, nameWithoutSigns.toLowerCase());
}
/**
* @param metaboliteNode
* @param counts
* @return
*/
private String getMetacycEntry(Node metaboliteNode, Map<MetaboliteMajorLabel, Set<Long>> counts) {
MetaboliteMajorLabel label = null;
if(metaboliteNode.hasProperty("major_label"))
label = getMetaboliteLabel(metaboliteNode);
if(counts.containsKey(MetaboliteMajorLabel.MetaCyc)) {
return getMetacycEntryAux(counts);
}
if((label.equals(MetaboliteMajorLabel.LigandCompound) && counts.containsKey(MetaboliteMajorLabel.ModelSeed) ||
label.equals(MetaboliteMajorLabel.ModelSeed) && counts.containsKey(MetaboliteMajorLabel.LigandCompound))
&& !counts.containsKey(MetaboliteMajorLabel.MetaCyc)) {
MetaboliteMajorLabel targetLabel = MetaboliteMajorLabel.ModelSeed;
if(label.equals(MetaboliteMajorLabel.LigandCompound))
targetLabel = MetaboliteMajorLabel.LigandCompound;
String bestEntry = "";
if(counts.get(targetLabel).size() > 1) {
Set<String> referees = new HashSet<>();
Integer min = 99999999;
for(Long l : counts.get(targetLabel)) {