Commit 8d7a44c1 authored by Davide Lagoa
Browse files

search 'organism' in tcdb entries

parent 65220629
...@@ -34,13 +34,13 @@ public class ReadFastaTcdb { ...@@ -34,13 +34,13 @@ public class ReadFastaTcdb {
* *
* @return * @return
*/ */
public static Set<String> readfasta() { public static Set<String> readTcNumbersFromFasta(boolean useCache) {
try { try {
Map<String, FastaTcdb> fastaMap = new HashMap<>(); Map<String, FastaTcdb> fastaMap = new HashMap<>();
BufferedReader in = getTcdbFasta(); BufferedReader in = getTcdbFasta(useCache);
Set<String> accessions = new HashSet<>(); Set<String> accessions = new HashSet<>();
...@@ -50,10 +50,10 @@ public class ReadFastaTcdb { ...@@ -50,10 +50,10 @@ public class ReadFastaTcdb {
String accession = "", tcNumber = "", organism = "", description = ""; String accession = "", tcNumber = "", organism = "", description = "";
// Map<String, Integer> distributions1 = new TreeMap<>(); // Map<String, Integer> distributions1 = new TreeMap<>();
// Map<String, Integer> distributions2 = new TreeMap<>(); // Map<String, Integer> distributions2 = new TreeMap<>();
// Map<String, Integer> distributions3 = new TreeMap<>(); // Map<String, Integer> distributions3 = new TreeMap<>();
// Map<String, Integer> distributions4 = new TreeMap<>(); // Map<String, Integer> distributions4 = new TreeMap<>();
Map<String, Integer> distributions5 = new TreeMap<>(); Map<String, Integer> distributions5 = new TreeMap<>();
...@@ -113,57 +113,6 @@ public class ReadFastaTcdb { ...@@ -113,57 +113,6 @@ public class ReadFastaTcdb {
distributions5.put(tcNumber, 1); distributions5.put(tcNumber, 1);
} }
// System.out.println("tc--" + tcNumber);
// String[] newTcNumber = tcNumber.split("\\.");
//
// // System.out.println(newTcNumber.length);
//
// String newTc = newTcNumber[0].concat(".").concat(newTcNumber[1]).concat(".").concat(newTcNumber[2]).concat(".").concat(newTcNumber[3]);
//
// if(distributions4.containsKey(newTc)) {
//
// int count = distributions4.get(newTc);
// distributions4.put(newTc, count+1);
// }
//
// else
// distributions4.put(newTc, 1);
//
// newTc = newTcNumber[0].concat(".").concat(newTcNumber[1]).concat(".").concat(newTcNumber[2]);
//
// if(distributions3.containsKey(newTc)) {
//
// int count = distributions3.get(newTc);
// distributions3.put(newTc, count+1);
// }
//
// else
// distributions3.put(newTc, 1);
//
// newTc = newTcNumber[0].concat(".").concat(newTcNumber[1]);
//
// if(distributions2.containsKey(newTc)) {
//
// int count = distributions2.get(newTc);
// distributions2.put(newTc, count+1);
// }
//
// else
// distributions2.put(newTc, 1);
//
// newTc = newTcNumber[0];
//
// if(distributions1.containsKey(newTc)) {
//
// int count = distributions1.get(newTc);
// distributions1.put(newTc, count+1);
// }
//
// else
// distributions1.put(newTc, 1);
description = ""; description = "";
for(int i = 1; i < subSubHeader.length; i++) for(int i = 1; i < subSubHeader.length; i++)
...@@ -180,13 +129,6 @@ public class ReadFastaTcdb { ...@@ -180,13 +129,6 @@ public class ReadFastaTcdb {
fastaMap.put(accession.concat("_").concat(tcNumber), new FastaTcdb(accession, sequence, tcNumber, organism, description)); fastaMap.put(accession.concat("_").concat(tcNumber), new FastaTcdb(accession, sequence, tcNumber, organism, description));
// System.out.println("groups: " + distributions3.size());
//
// for(String key : distributions3.keySet())
// System.out.println(key + "\t" +distributions3.get(key));
System.out.println("Number of different accessions: " + accessions.size());
return distributions5.keySet(); return distributions5.keySet();
} }
catch (IOException e) { catch (IOException e) {
...@@ -203,32 +145,33 @@ public class ReadFastaTcdb { ...@@ -203,32 +145,33 @@ public class ReadFastaTcdb {
* @return * @return
* @throws FileNotFoundException * @throws FileNotFoundException
*/ */
public static BufferedReader getTcdbFasta() throws FileNotFoundException { public static BufferedReader getTcdbFasta(boolean useCache) throws FileNotFoundException {
try { try {
String filePath = path + FilesUtils.generateFileName(fileName, ".txt"); if(useCache) {
String filePath = path + FilesUtils.generateFileName(fileName, ".txt");
OutputStream out = new FileOutputStream(filePath); OutputStream out = new FileOutputStream(filePath);
LinkConnection conn = new LinkConnection(); LinkConnection conn = new LinkConnection();
if(conn.getCodeConnection(TcdbExplorer.TCDB_FASTA_URL) == 200) { if(conn.getCodeConnection(TcdbExplorer.TCDB_FASTA_URL) == 200) {
webPageSaver(conn.getPageOpenStream(), out); webPageSaver(conn.getPageOpenStream(), out);
saveLastKnownVersion(filePath); saveLastKnownVersion(filePath);
return conn.getPage(); return conn.getPage();
}
out.close();
} }
else {
String lastFilePath = getLastKnownVersion();
return new BufferedReader(new FileReader(lastFilePath)); String lastFilePath = getLastKnownVersion();
} return new BufferedReader(new FileReader(lastFilePath));
} }
catch (Exception e) { catch (Exception e) {
e.printStackTrace(); e.printStackTrace();
...@@ -313,6 +256,99 @@ public class ReadFastaTcdb { ...@@ -313,6 +256,99 @@ public class ReadFastaTcdb {
return word; return word;
} }
/**
 * Read the TCDB fasta file and retrieve all the information of every entry.
 *
 * Lines containing '>' are treated as headers and split on '|': field 2 is the accession and
 * field 3 holds the TC number (first word), the description (remaining words) and, optionally,
 * the organism between square brackets. Every other line is appended to the sequence of the
 * entry currently being read.
 *
 * @param useCache forwarded unchanged to getTcdbFasta(boolean)
 * @return map from "accession_tcNumber" to the corresponding FastaTcdb (empty when no header
 *         line was read), or null when no reader is available or an I/O error occurs
 */
public static Map<String, FastaTcdb> readfasta(boolean useCache) {

	// try-with-resources closes the reader on every exit path, including parsing failures
	try (BufferedReader in = getTcdbFasta(useCache)) {

		// NOTE(review): getTcdbFasta seems able to hand back no reader after a failure - confirm
		if(in == null)
			return null;

		Map<String, FastaTcdb> fastaMap = new HashMap<>();

		StringBuilder sequence = new StringBuilder();
		String accession = "", tcNumber = "", organism = "", description = "";
		boolean entryOpen = false;
		String html;

		while ((html = in.readLine()) != null) {

			String text = Jsoup.parse(html).body().text().trim();

			if(!text.contains(">")) {
				// sequence line of the entry currently being read
				sequence.append(text);
				continue;
			}

			// a new header closes the previous entry, if there was one
			if(entryOpen)
				fastaMap.put(accession.concat("_").concat(tcNumber), new FastaTcdb(accession, sequence.toString(), tcNumber, organism, description));

			entryOpen = true;

			String[] header = text.split("\\|");
			accession = header[2];

			String annotation = header[3];

			// String.contains/indexOf take literal text, not a regex, so '[' must not be escaped.
			// The organism is read from the last '[' so that brackets inside the description
			// remain part of the description.
			int bracket = annotation.lastIndexOf('[');

			if(bracket >= 0) {
				organism = annotation.substring(bracket + 1).replace("]", "").trim();
				annotation = annotation.substring(0, bracket);
			}
			else {
				organism = null;
			}

			String[] words = annotation.split("\\s+");
			tcNumber = words[0];

			// every word after the TC number, each followed by a single space
			StringBuilder descriptionBuilder = new StringBuilder();
			for(int i = 1; i < words.length; i++)
				descriptionBuilder.append(words[i]).append(" ");
			description = descriptionBuilder.toString();

			sequence.setLength(0);
		}

		// store the last entry; nothing is stored when the input had no header line at all
		if(entryOpen)
			fastaMap.put(accession.concat("_").concat(tcNumber), new FastaTcdb(accession, sequence.toString(), tcNumber, organism, description));

		return fastaMap;
	}
	catch (IOException e) {
		e.printStackTrace();
	}
	return null;
}
} }
......
...@@ -24,31 +24,29 @@ public class Retriever { ...@@ -24,31 +24,29 @@ public class Retriever {
try { try {
@SuppressWarnings("resource") // @SuppressWarnings("resource")
Scanner reader = new Scanner(System.in); // Scanner reader = new Scanner(System.in);
System.out.println("Retrieving TCDB FASTA file..."); System.out.println("Retrieving TCDB FASTA file...");
Set<String> tcNumbers = TcdbExplorer.getTcNumbers(); Set<String> tcNumbers = TcdbExplorer.getTcNumbers(false);
Set<String> toSearch = TcdbExplorer.generateTCsFamily(tcNumbers); Set<String> toSearch = TcdbExplorer.generateTCsFamily(tcNumbers);
// toSearch.add("2.A.75"); System.out.println("Retrieving data from TCDB...");
//// Map<String, TcNumberContainer> data = FindTransporters.getAllTCNumbersInformation(toSearch);
// System.out.println("Retrieving data from TCDB...");
// Map<String, TcNumberContainer> data = FindTransporters.getAllTCNumbersInformation(toSearch);
// Map<String, String> proteinFamilyDescription = TcdbExplorer.getProteinsBelongingToFamilyDescription(tcNumbers); Map<String, String> proteinFamilyDescription = TcdbExplorer.getProteinsBelongingToFamilyDescription(tcNumbers);
//
// FilesUtils.saveMapInFile("C:\\Users\\Davide\\Documents\\InternalDB\\proteinFamilyDescriptions.txt", proteinFamilyDescription); FilesUtils.saveMapInFile("C:\\Users\\Davide\\Documents\\InternalDB\\proteinFamilyDescriptions.txt", proteinFamilyDescription);
Map<String, String> proteinFamilyDescription = FilesUtils.readMapFromFile("C:\\Users\\Davide\\Documents\\InternalDB\\proteinFamilyDescriptions.txt"); // Map<String, String> proteinFamilyDescription = FilesUtils.readMapFromFile("C:\\Users\\Davide\\Documents\\InternalDB\\proteinFamilyDescriptions.txt");
System.out.println("SEARCH COMPLETE..."); System.out.println("SEARCH COMPLETE...");
// //
int n = 1; // int n = 1;
while (n != 99) { // while (n != 99) {
// Map<Integer, ReactionContainer> container = data.get("2.A.1").getReactionsByID(); // Map<Integer, ReactionContainer> container = data.get("2.A.1").getReactionsByID();
// //
...@@ -63,7 +61,7 @@ public class Retriever { ...@@ -63,7 +61,7 @@ public class Retriever {
// ///////////////DESCOMENTAR // ///////////////DESCOMENTAR
Map<String, TcNumberContainer> data = JSONFilesUtils.readJSONExceptionsFile(); //the reader that reads exceptios can also read normal data // Map<String, TcNumberContainer> data = JSONFilesUtils.readJSONExceptionsFile(); //the reader that reads exceptios can also read normal data
data = ProcessCompartments.processCompartments(data); data = ProcessCompartments.processCompartments(data);
...@@ -79,133 +77,9 @@ public class Retriever { ...@@ -79,133 +77,9 @@ public class Retriever {
///////////////DESCOMENTAR ///////////////DESCOMENTAR
// n = 99;
// checkDescriptions(tcdbMetabolites);
// String alias = "solute2";
//
// String originalReaction = "Nucleobase or Solute2 (out) + H+ (out) $IRREV$ Nucleobase or Vitamin (in) + H+ (in)";
//
// if(originalReaction.matches("(?i).*\\s+" + alias + "\\s+.*"))
// System.out.println(true);
//
// System.out.println(originalReaction);
////
// System.out.println(subtext.length);
// if(originalReaction.contains(" and/or ")) {
// System.out.println("yes");
//
// originalReaction = originalReaction.replace(" and/or ", " ");
// }
//
// System.out.println(originalReaction);
// GenerateTransportReactions.getAllPossibleReactions("2e- (in) $IRREV$ 2e- (out)", "2e- (in) ");
//
// String text = "2e- (in) $IRREV$ 2e- (out)";
//
//// String[] text2 = text.split(ReactionContainer.IRREVERSIBLE_TOKEN);
// String[] text2 = text.split("\\$IRREV\\$");
// System.out.println(text2[0]);
// text = text.replaceAll("\\(e.g., in reduced cytochrome in the periplasm\\)", "in");
// System.out.println(text.substring(text.length()-1));
// for(String key : exceptions.keySet())
// data.replace(key, exceptions.get(key));
//
// InfoToFile.writeFile(data);
// System.out.println("FILE COMPLETE...");
// Map<String, TcNumberContainer> file = JSONFilesUtils.readJSONExceptionsFile();
//
// Set<String> tcs = new HashSet<>();
//
//// tcs.add("3.E.2");
// tcs.add("2.A.11");
// tcs.add("3.B.1");
// tcs.add("9.B.39");
// tcs.add("2.A.108");
//// tcs.add("4.A.6");
//// tcs.add("3.D.4");
//// tcs.add("3.D.2");
// tcs.add("3.D.7");
// tcs.add("2.A.23");
//// tcs.add("2.A.12");
//// tcs.add("4.A.7");
// tcs.add("2.A.47");
// tcs.add("2.A.75");
//// tcs.add("9.A.39");
// tcs.add("2.A.51");
//// tcs.add("9.A.8");
//// tcs.add("5.B.1");
//
//// String tc = "3.D.10";
//
//// for(String tc : file.keySet()) {
//
// for(String tc : tcs) {
//
// System.out.println(tc);
//
// Map<Integer, ReactionContainer> reactions = file.get(tc).getReactionsByID();
//
// for(int key : reactions.keySet()) {
//
// ReactionContainer reaction = reactions.get(key);
//
// System.out.println(reaction.getReaction());
//
// TypeOfTransporter type = FindTransporters.findTypeOfTransport2(reaction, tc);
//
// System.out.println(type);
//
// System.out.println();
//
// }
//
// }
// System.out.println("Enter a random number to repeat (100 to repeat data retrieval) or 99 to finish: ");
// n = reader.nextInt(); // }
n = 99; ////////////////ethshghdsrheshsezghzs
}
System.out.println("SHUTDOWN..."); System.out.println("SHUTDOWN...");
...@@ -253,10 +127,6 @@ public class Retriever { ...@@ -253,10 +127,6 @@ public class Retriever {
} }
} }
System.out.println(uniport.size());
System.out.println(symport.size());
System.out.println(antiport.size());
} }
// List<String> metabolitesAux2 = new ArrayList<>(); // List<String> metabolitesAux2 = new ArrayList<>();
......
...@@ -35,11 +35,11 @@ public class TcdbExplorer { ...@@ -35,11 +35,11 @@ public class TcdbExplorer {
* *
* @return * @return
*/ */
public static Set<String> getTcNumbers(){ public static Set<String> getTcNumbers(boolean useCache){
try { try {
return getAllTcNumbers(); return getAllTcNumbers(useCache);
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace(); e.printStackTrace();
...@@ -54,9 +54,9 @@ public class TcdbExplorer { ...@@ -54,9 +54,9 @@ public class TcdbExplorer {
* @return * @return
* @throws Exception * @throws Exception
*/ */
private static Set<String> getAllTcNumbers() throws Exception{ private static Set<String> getAllTcNumbers(boolean useCache) throws Exception{
return ReadFastaTcdb.readfasta(); return ReadFastaTcdb.readTcNumbersFromFasta(useCache);
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment