GitLab CI for Docker builds is enabled! You can enable it using .gitlab-ci.yml in your project. Check the file template at https://gitlab.bio.di.uminho.pt/snippets/5

Commit 8d7a44c1 authored by Davide Lagoa
Browse files

search 'organism' in tcdb entries

parent 65220629
......@@ -34,13 +34,13 @@ public class ReadFastaTcdb {
*
* @return
*/
public static Set<String> readfasta() {
public static Set<String> readTcNumbersFromFasta(boolean useCache) {
try {
Map<String, FastaTcdb> fastaMap = new HashMap<>();
BufferedReader in = getTcdbFasta();
BufferedReader in = getTcdbFasta(useCache);
Set<String> accessions = new HashSet<>();
......@@ -50,10 +50,10 @@ public class ReadFastaTcdb {
String accession = "", tcNumber = "", organism = "", description = "";
// Map<String, Integer> distributions1 = new TreeMap<>();
// Map<String, Integer> distributions2 = new TreeMap<>();
// Map<String, Integer> distributions3 = new TreeMap<>();
// Map<String, Integer> distributions4 = new TreeMap<>();
// Map<String, Integer> distributions1 = new TreeMap<>();
// Map<String, Integer> distributions2 = new TreeMap<>();
// Map<String, Integer> distributions3 = new TreeMap<>();
// Map<String, Integer> distributions4 = new TreeMap<>();
Map<String, Integer> distributions5 = new TreeMap<>();
......@@ -113,57 +113,6 @@ public class ReadFastaTcdb {
distributions5.put(tcNumber, 1);
}
// System.out.println("tc--" + tcNumber);
// String[] newTcNumber = tcNumber.split("\\.");
//
// // System.out.println(newTcNumber.length);
//
// String newTc = newTcNumber[0].concat(".").concat(newTcNumber[1]).concat(".").concat(newTcNumber[2]).concat(".").concat(newTcNumber[3]);
//
// if(distributions4.containsKey(newTc)) {
//
// int count = distributions4.get(newTc);
// distributions4.put(newTc, count+1);
// }
//
// else
// distributions4.put(newTc, 1);
//
// newTc = newTcNumber[0].concat(".").concat(newTcNumber[1]).concat(".").concat(newTcNumber[2]);
//
// if(distributions3.containsKey(newTc)) {
//
// int count = distributions3.get(newTc);
// distributions3.put(newTc, count+1);
// }
//
// else
// distributions3.put(newTc, 1);
//
// newTc = newTcNumber[0].concat(".").concat(newTcNumber[1]);
//
// if(distributions2.containsKey(newTc)) {
//
// int count = distributions2.get(newTc);
// distributions2.put(newTc, count+1);
// }
//
// else
// distributions2.put(newTc, 1);
//
// newTc = newTcNumber[0];
//
// if(distributions1.containsKey(newTc)) {
//
// int count = distributions1.get(newTc);
// distributions1.put(newTc, count+1);
// }
//
// else
// distributions1.put(newTc, 1);
description = "";
for(int i = 1; i < subSubHeader.length; i++)
......@@ -180,13 +129,6 @@ public class ReadFastaTcdb {
fastaMap.put(accession.concat("_").concat(tcNumber), new FastaTcdb(accession, sequence, tcNumber, organism, description));
// System.out.println("groups: " + distributions3.size());
//
// for(String key : distributions3.keySet())
// System.out.println(key + "\t" +distributions3.get(key));
System.out.println("Number of different accessions: " + accessions.size());
return distributions5.keySet();
}
catch (IOException e) {
......@@ -203,32 +145,33 @@ public class ReadFastaTcdb {
* @return
* @throws FileNotFoundException
*/
public static BufferedReader getTcdbFasta() throws FileNotFoundException {
public static BufferedReader getTcdbFasta(boolean useCache) throws FileNotFoundException {
try {
String filePath = path + FilesUtils.generateFileName(fileName, ".txt");
if(useCache) {
String filePath = path + FilesUtils.generateFileName(fileName, ".txt");
OutputStream out = new FileOutputStream(filePath);
OutputStream out = new FileOutputStream(filePath);
LinkConnection conn = new LinkConnection();
LinkConnection conn = new LinkConnection();
if(conn.getCodeConnection(TcdbExplorer.TCDB_FASTA_URL) == 200) {
if(conn.getCodeConnection(TcdbExplorer.TCDB_FASTA_URL) == 200) {
webPageSaver(conn.getPageOpenStream(), out);
webPageSaver(conn.getPageOpenStream(), out);
saveLastKnownVersion(filePath);
saveLastKnownVersion(filePath);
return conn.getPage();
return conn.getPage();
}
out.close();
}
else {
String lastFilePath = getLastKnownVersion();
return new BufferedReader(new FileReader(lastFilePath));
String lastFilePath = getLastKnownVersion();
}
return new BufferedReader(new FileReader(lastFilePath));
}
catch (Exception e) {
e.printStackTrace();
......@@ -313,6 +256,99 @@ public class ReadFastaTcdb {
return word;
}
/**
 * Read the TCDB FASTA file and retrieve all information of every record.
 *
 * Each header line (a line containing '>') is split on '|': field 2 is taken as
 * the accession and field 3 as "tcNumber description [organism]". The lines that
 * follow a header are concatenated into the sequence of that record.
 *
 * @param useCache forwarded unchanged to {@link #getTcdbFasta(boolean)}
 * @return map from "accession_tcNumber" to the parsed {@link FastaTcdb} entry
 *         (empty if the file contains no header line), or {@code null} if an
 *         {@link IOException} occurs while reading
 */
public static Map<String, FastaTcdb> readfasta(boolean useCache) {

	// try-with-resources: the reader was previously never closed.
	try (BufferedReader in = getTcdbFasta(useCache)) {

		Map<String, FastaTcdb> fastaMap = new HashMap<>();

		String html;

		StringBuilder sequence = new StringBuilder();
		String accession = "", tcNumber = "", organism = "", description = "";

		boolean firstTime = true;

		while ((html = in.readLine()) != null) {

			Document doc = Jsoup.parse(html);
			String text = doc.body().text().trim();

			if (text.contains(">")) {

				// A new header closes the previous record (if there was one).
				if (!firstTime)
					fastaMap.put(accession.concat("_").concat(tcNumber), new FastaTcdb(accession, sequence.toString(), tcNumber, organism, description));

				firstTime = false;

				String[] header = text.split("\\|");

				accession = header[2];

				// The organism, when present, is enclosed in square brackets after the
				// description. String.contains() takes a literal, not a regex, so the
				// former contains("\\[") never matched and the organism was always null.
				// Splitting first and checking the length also avoids an out-of-bounds
				// access when nothing follows the bracket.
				String[] subHeader = header[3].split("\\[");
				String[] subSubHeader;

				if (subHeader.length > 1) {

					organism = subHeader[1].replaceAll("\\]", "");
					subSubHeader = subHeader[0].split("\\s+");
				}
				else {
					organism = null;
					subSubHeader = header[3].split("\\s+");
				}

				tcNumber = subSubHeader[0];

				// Everything after the TC number is the description; each word keeps
				// a trailing space, exactly as before.
				StringBuilder descriptionBuilder = new StringBuilder();

				for (int i = 1; i < subSubHeader.length; i++)
					descriptionBuilder.append(subSubHeader[i]).append(" ");

				description = descriptionBuilder.toString();

				sequence.setLength(0);
			}
			else {
				sequence.append(text);
			}
		}

		// Flush the last record, but only if at least one header was read; otherwise
		// an empty file would produce a bogus "_" entry with empty fields.
		if (!firstTime)
			fastaMap.put(accession.concat("_").concat(tcNumber), new FastaTcdb(accession, sequence.toString(), tcNumber, organism, description));

		return fastaMap;

	}
	catch (IOException e) {
		e.printStackTrace();
	}
	return null;
}
}
......
......@@ -24,31 +24,29 @@ public class Retriever {
try {
@SuppressWarnings("resource")
Scanner reader = new Scanner(System.in);
// @SuppressWarnings("resource")
// Scanner reader = new Scanner(System.in);
System.out.println("Retrieving TCDB FASTA file...");
Set<String> tcNumbers = TcdbExplorer.getTcNumbers();
Set<String> tcNumbers = TcdbExplorer.getTcNumbers(false);
Set<String> toSearch = TcdbExplorer.generateTCsFamily(tcNumbers);
// toSearch.add("2.A.75");
////
// System.out.println("Retrieving data from TCDB...");
// Map<String, TcNumberContainer> data = FindTransporters.getAllTCNumbersInformation(toSearch);
System.out.println("Retrieving data from TCDB...");
Map<String, TcNumberContainer> data = FindTransporters.getAllTCNumbersInformation(toSearch);
// Map<String, String> proteinFamilyDescription = TcdbExplorer.getProteinsBelongingToFamilyDescription(tcNumbers);
//
// FilesUtils.saveMapInFile("C:\\Users\\Davide\\Documents\\InternalDB\\proteinFamilyDescriptions.txt", proteinFamilyDescription);
Map<String, String> proteinFamilyDescription = TcdbExplorer.getProteinsBelongingToFamilyDescription(tcNumbers);
FilesUtils.saveMapInFile("C:\\Users\\Davide\\Documents\\InternalDB\\proteinFamilyDescriptions.txt", proteinFamilyDescription);
Map<String, String> proteinFamilyDescription = FilesUtils.readMapFromFile("C:\\Users\\Davide\\Documents\\InternalDB\\proteinFamilyDescriptions.txt");
// Map<String, String> proteinFamilyDescription = FilesUtils.readMapFromFile("C:\\Users\\Davide\\Documents\\InternalDB\\proteinFamilyDescriptions.txt");
System.out.println("SEARCH COMPLETE...");
//
int n = 1;
// int n = 1;
while (n != 99) {
// while (n != 99) {
// Map<Integer, ReactionContainer> container = data.get("2.A.1").getReactionsByID();
//
......@@ -63,7 +61,7 @@ public class Retriever {
// ///////////////DESCOMENTAR
Map<String, TcNumberContainer> data = JSONFilesUtils.readJSONExceptionsFile(); //the reader that reads exceptios can also read normal data
// Map<String, TcNumberContainer> data = JSONFilesUtils.readJSONExceptionsFile(); //the reader that reads exceptios can also read normal data
data = ProcessCompartments.processCompartments(data);
......@@ -79,133 +77,9 @@ public class Retriever {
///////////////DESCOMENTAR
// checkDescriptions(tcdbMetabolites);
// String alias = "solute2";
//
// String originalReaction = "Nucleobase or Solute2 (out) + H+ (out) $IRREV$ Nucleobase or Vitamin (in) + H+ (in)";
//
// if(originalReaction.matches("(?i).*\\s+" + alias + "\\s+.*"))
// System.out.println(true);
//
// System.out.println(originalReaction);
////
// System.out.println(subtext.length);
// if(originalReaction.contains(" and/or ")) {
// System.out.println("yes");
//
// originalReaction = originalReaction.replace(" and/or ", " ");
// }
//
// System.out.println(originalReaction);
// GenerateTransportReactions.getAllPossibleReactions("2e- (in) $IRREV$ 2e- (out)", "2e- (in) ");
//
// String text = "2e- (in) $IRREV$ 2e- (out)";
//
//// String[] text2 = text.split(ReactionContainer.IRREVERSIBLE_TOKEN);
// String[] text2 = text.split("\\$IRREV\\$");
// System.out.println(text2[0]);
// text = text.replaceAll("\\(e.g., in reduced cytochrome in the periplasm\\)", "in");
// System.out.println(text.substring(text.length()-1));
// for(String key : exceptions.keySet())
// data.replace(key, exceptions.get(key));
//
// InfoToFile.writeFile(data);
// System.out.println("FILE COMPLETE...");
// Map<String, TcNumberContainer> file = JSONFilesUtils.readJSONExceptionsFile();
//
// Set<String> tcs = new HashSet<>();
//
//// tcs.add("3.E.2");
// tcs.add("2.A.11");
// tcs.add("3.B.1");
// tcs.add("9.B.39");
// tcs.add("2.A.108");
//// tcs.add("4.A.6");
//// tcs.add("3.D.4");
//// tcs.add("3.D.2");
// tcs.add("3.D.7");
// tcs.add("2.A.23");
//// tcs.add("2.A.12");
//// tcs.add("4.A.7");
// tcs.add("2.A.47");
// tcs.add("2.A.75");
//// tcs.add("9.A.39");
// tcs.add("2.A.51");
//// tcs.add("9.A.8");
//// tcs.add("5.B.1");
//
//// String tc = "3.D.10";
//
//// for(String tc : file.keySet()) {
//
// for(String tc : tcs) {
//
// System.out.println(tc);
//
// Map<Integer, ReactionContainer> reactions = file.get(tc).getReactionsByID();
//
// for(int key : reactions.keySet()) {
//
// ReactionContainer reaction = reactions.get(key);
//
// System.out.println(reaction.getReaction());
//
// TypeOfTransporter type = FindTransporters.findTypeOfTransport2(reaction, tc);
//
// System.out.println(type);
//
// System.out.println();
//
// }
//
// }
// System.out.println("Enter a random number to repeat (100 to repeat data retrieval) or 99 to finish: ");
// n = 99;
// n = reader.nextInt();
n = 99; ////////////////ethshghdsrheshsezghzs
}
// }
System.out.println("SHUTDOWN...");
......@@ -253,10 +127,6 @@ public class Retriever {
}
}
System.out.println(uniport.size());
System.out.println(symport.size());
System.out.println(antiport.size());
}
// List<String> metabolitesAux2 = new ArrayList<>();
......
......@@ -35,11 +35,11 @@ public class TcdbExplorer {
*
* @return
*/
public static Set<String> getTcNumbers(){
public static Set<String> getTcNumbers(boolean useCache){
try {
return getAllTcNumbers();
return getAllTcNumbers(useCache);
} catch (Exception e) {
e.printStackTrace();
......@@ -54,9 +54,9 @@ public class TcdbExplorer {
* @return
* @throws Exception
*/
private static Set<String> getAllTcNumbers() throws Exception{
private static Set<String> getAllTcNumbers(boolean useCache) throws Exception{
return ReadFastaTcdb.readfasta();
return ReadFastaTcdb.readTcNumbersFromFasta(useCache);
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment