Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
TranSyT
transyt-scraper
Commits
c99b7898
Commit
c99b7898
authored
Jun 12, 2020
by
Davide Lagoa
Browse files
kegg api created to complete PTS reactions GPR
parent
514c6f73
Changes
6
Hide whitespace changes
Inline
Side-by-side
src/main/java/pt/uminho/ceb/biosystems/transyt/scraper/APIs/KeggAPI.java
0 → 100644
View file @
c99b7898
package
pt.uminho.ceb.biosystems.transyt.scraper.APIs
;
import
java.io.BufferedReader
;
import
java.io.BufferedWriter
;
import
java.io.File
;
import
java.io.FileWriter
;
import
java.io.IOException
;
import
java.time.Duration
;
import
java.time.Instant
;
import
java.util.HashMap
;
import
java.util.HashSet
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Set
;
import
java.util.concurrent.TimeUnit
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
pt.uminho.ceb.biosystems.transyt.utilities.connection.LinkConnection
;
import
pt.uminho.ceb.biosystems.transyt.utilities.files.FilesUtils
;
public
class
KeggAPI
{
// Path handed to FilesUtils.saveLastKnownVersion(...) by saveResults, together with the path of
// the FASTA file being generated.
// NOTE(review): the file name is prefixed "tcdb" although it is placed in the KEGG FASTA
// directory - confirm that this is intended and not a copy/paste leftover.
public
static
final
String
PATH_LAST_KNOWN_VERSION
=
FilesUtils
.
getKeggFastaDirectory
().
concat
(
"tcdbLastKnownVersion.log"
);
// Base name given to FilesUtils.generateFileName(...) (with the ".faa" extension) in saveResults.
private
static
final
String
KEGG_FASTA_NAME
=
"keggFasta"
;
// Map loaded once, at class initialisation, from the "KOs to search" file. The keys are the KO
// identifiers that get searched; the value of a KO is appended to the FASTA identifier of each
// of its sequences (see scrapeGenesSequences).
private
static
final
Map
<
String
,
String
>
KOs
=
FilesUtils
.
readMapFromFile
(
FilesUtils
.
getKOsToSearchFilePath
());
// Prefix of every request URL built by this class (KEGG REST "get" operation).
public
static
final
String
BASE_URL
=
"http://rest.kegg.jp/get/"
;
// Maximum number of attempts made for a single request before giving up.
public
static
final
int
LIMIT
=
5
;
// Maximum number of genes joined into a single query by generateBatchesDistribution.
public
static
final
int
BATCH_SIZE
=
10
;
//KEGG is limited to 10 items per request
// Minimum time, in milliseconds, enforced by applyWait between the start of a successful request
// and the moment the next one may be issued.
public
static
final
int
DEFAULT_DELAY_MILLIS
=
800
;
private
static
final
Logger
logger
=
LoggerFactory
.
getLogger
(
KeggAPI
.
class
);
/**
 * Searches KEGG for the genes associated with every KO listed in {@code KOs}, retrieves the
 * amino acid sequence of each gene and saves all sequences in a new FASTA file.
 * <p>
 * A failure while processing one KO is logged and the search moves on to the next KO. If the
 * thread is interrupted the search stops, the interrupt flag is restored, and the sequences
 * collected so far are still saved and returned.
 *
 * @return map of FASTA identifier to amino acid sequence for every KO that could be processed
 */
public static Map<String, String> searchKeegPTSGenesAndBuildFastaFiles() {

    Map<String, String> results = new HashMap<>();

    for (String ko : KOs.keySet()) {

        logger.info("Searching KO: " + ko);

        try {
            LinkConnection connection = getKOInfo(ko);

            // getKOInfo returns null when every attempt failed; skip the KO explicitly instead
            // of relying on a NullPointerException being caught below.
            if (connection == null) {
                logger.error("No response obtained from KEGG for KO: " + ko);
                continue;
            }

            Map<String, Set<String>> genes = scrapeGenesFromKoInfo(connection.getPage());
            Set<String> queries = generateBatchesDistribution(genes);

            results.putAll(getGenesProteinSequence(queries, ko));
        }
        catch (InterruptedException e) {
            // Do not keep issuing requests once interruption was requested.
            Thread.currentThread().interrupt();
            logger.error("Interrupted while retrieving KO data: " + ko, e);
            break;
        }
        catch (Exception e) {
            logger.error("An error occurred retrievig KO data: " + ko, e);
        }
    }

    saveResults(results);

    return results;
}
/**
 * Requests the KEGG flat-file entry of a KO, retrying up to {@code LIMIT} times.
 * <p>
 * The pause between attempts is performed outside of the {@code try} block so that an
 * interruption is propagated to the caller instead of being counted as a failed attempt.
 * No pause is made after the last attempt.
 *
 * @param ko KO identifier, appended to {@code BASE_URL + "ko:"}
 * @return the connection that answered with HTTP code 200, or {@code null} if all attempts
 *         failed or an {@link ArrayIndexOutOfBoundsException} was raised while connecting
 * @throws InterruptedException if the thread is interrupted while waiting to retry
 */
public static LinkConnection getKOInfo(String ko) throws InterruptedException {

    String link = BASE_URL + "ko:" + ko;

    int attempt = 0;

    while (attempt < LIMIT) {

        try {
            LinkConnection conn = new LinkConnection();

            int code = conn.getCodeConnection(link);

            if (code == 200)
                return conn;

            logger.warn("Request to {} returned code {}", link, code);
        }
        catch (ArrayIndexOutOfBoundsException e1) {
            // Same policy as before: this failure is not retried.
            logger.error("An error occurred while retrieving entry {}", ko, e1);
            return null;
        }
        catch (Exception e) {
            logger.warn("Request to {} failed", link, e);
        }

        attempt++;

        if (attempt < LIMIT) {
            logger.warn("Retrying connection... Attempt nr: {}", attempt);
            TimeUnit.SECONDS.sleep(30);
        }
    }

    return null;
}
/**
 * Retrieves the amino acid sequences of every batch of KEGG genes in {@code queries}.
 * <p>
 * Each query is requested up to {@code LIMIT} times. Queries that could not be retrieved are
 * skipped and reported in the log once the search is finished. Progress is logged every time
 * the percentage of processed queries increases.
 *
 * @param queries batches of KEGG gene identifiers, as built by {@code generateBatchesDistribution}
 * @param reference KO identifier the genes belong to; it becomes part of each FASTA identifier
 * @return map of FASTA identifier to amino acid sequence
 * @throws InterruptedException if the thread is interrupted while waiting to retry a request
 */
public static Map<String, String> getGenesProteinSequence(Set<String> queries, String reference) throws InterruptedException {

    Map<String, String> results = new HashMap<>();
    Set<String> failed = new HashSet<>();

    logger.info("Searching protein sequences for each KEGG gene...");

    int processed = 0;
    int lastProgress = -1;

    for (String query : queries) {

        if (!fetchSequencesBatch(query, reference, results))
            failed.add(query);

        processed++;

        int progress = (processed * 100) / queries.size();

        if (progress > lastProgress) {
            lastProgress = progress;
            logger.info(progress + " % search complete");
        }
    }

    if (!failed.isEmpty())
        logger.warn("{} queries could not be retrieved for {}: {}", failed.size(), reference, failed);

    return results;
}

/**
 * Requests a single batch of genes and adds the sequences found to {@code results}.
 *
 * @param query batch of KEGG gene identifiers, appended to {@code BASE_URL}
 * @param reference KO identifier the genes belong to
 * @param results map receiving the sequences of the batch
 * @return {@code true} if the batch was retrieved and parsed, {@code false} if it was given up
 * @throws InterruptedException if the thread is interrupted while waiting to retry
 */
private static boolean fetchSequencesBatch(String query, String reference, Map<String, String> results) throws InterruptedException {

    String link = BASE_URL + query;

    int attempt = 0;

    while (attempt < LIMIT) {

        Instant start = Instant.now();

        try {
            LinkConnection conn = new LinkConnection();

            int code = conn.getCodeConnection(link);

            if (code == 200) {
                results.putAll(scrapeGenesSequences(conn.getPage(), reference));

                // Keep a minimum interval between consecutive successful requests.
                applyWait(start);

                return true;
            }

            logger.warn("Request to {} returned code {}", link, code);
        }
        catch (ArrayIndexOutOfBoundsException e1) {
            // Same policy as before: this failure is not retried.
            logger.error("An error occurred while retrieving entry {}", query, e1);
            return false;
        }
        catch (Exception e) {
            logger.warn("Request to {} failed", link, e);
        }

        attempt++;

        // The pause lives outside of the try block so that an interruption reaches the caller
        // instead of being handled as one more failed attempt.
        if (attempt < LIMIT) {
            logger.warn("Retrying connection... Attempt nr: {}", attempt);
            TimeUnit.SECONDS.sleep(30);
        }
    }

    return false;
}
/**
 * Reads KEGG gene records and collects the amino acid sequence of each one.
 * <p>
 * For every record, the second token of the {@code ENTRY} and {@code ORGANISM} lines is kept
 * and the lines following {@code AASEQ} are concatenated. The sequence is stored when the
 * {@code NTSEQ} line is reached or, if the record has no {@code NTSEQ} line, when the record
 * terminator {@code ///} is reached, so that sequence lines never leak into the next record.
 * <p>
 * The identifier of each sequence is {@code kegg|<reference>|<organism>_<entry>}, followed by
 * {@code |<value>} when {@code KOs} holds a value for the reference.
 *
 * @param in reader over the response of a KEGG "get" request; it is not closed by this method
 * @param reference KO identifier the genes belong to
 * @return map of FASTA identifier to amino acid sequence
 * @throws IOException if the reader fails
 */
public static Map<String, String> scrapeGenesSequences(BufferedReader in, String reference) throws IOException {

    Map<String, String> res = new HashMap<>();

    String entry = null;
    String organism = null;
    StringBuilder sequence = new StringBuilder();
    boolean read = false;

    String html;

    while ((html = in.readLine()) != null) {

        try {
            Document doc = Jsoup.parse(html);
            String text = doc.body().text().trim();

            boolean endOfRecord = text.equals("///");

            if (text.matches("^ENTRY\\s+.*")) {
                entry = text.split("\\s+")[1].trim();
            }
            else if (text.matches("^ORGANISM\\s+.*")) {
                organism = text.split("\\s+")[1].trim();
            }
            else if (text.matches("^AASEQ\\s+.*")) {
                read = true;
            }
            else if (text.matches("^NTSEQ\\s+.*") || (endOfRecord && read)) {

                String id = "kegg|" + reference + "|" + organism + "_" + entry;

                if (KOs.containsKey(reference))
                    id = id.concat("|" + KOs.get(reference));

                res.put(id, sequence.toString());

                sequence.setLength(0);
                read = false;
            }
            else if (read) {
                sequence.append(text);
            }

            // Never carry a partial sequence over to the next record.
            if (endOfRecord) {
                sequence.setLength(0);
                read = false;
            }
        }
        catch (Exception e) {
            logger.error("An error occurred while parsing a line of the sequences of {}", reference, e);
        }
    }

    return res;
}
/**
 * Reads the {@code GENES} section of a KEGG KO entry.
 * <p>
 * Reading starts at the line beginning with {@code GENES} and stops at a {@code REFERENCE}
 * line or at the record terminator {@code ///}. Each line is split on whitespace: a first token
 * ending in {@code ':'} is the organism prefix (lower-cased, colon kept, e.g. {@code "eco:"});
 * the remaining tokens are gene identifiers, from which everything after {@code '('} is dropped.
 *
 * @param in reader over the response of a KEGG "get" request for a KO; it is not closed here
 * @return map of organism prefix to the set of gene identifiers listed for that organism
 * @throws IOException if the reader fails
 */
public static Map<String, Set<String>> scrapeGenesFromKoInfo(BufferedReader in) throws IOException {

    Map<String, Set<String>> res = new HashMap<>();

    boolean read = false;
    String db = null;

    String html;

    while ((html = in.readLine()) != null) {

        try {
            Document doc = Jsoup.parse(html);
            String text = doc.body().text().trim();

            if (text.matches("^GENES\\s+.*")) {
                read = true;
                // Strings are immutable: the stripped text has to be assigned back, otherwise
                // "GENES" is taken as the organism and the real organism as one of its genes.
                text = text.replaceAll("^GENES", "").trim();
            }
            else if (text.matches("^REFERENCE(\\s.*)?") || text.equals("///")) {
                read = false;
            }

            if (!read || text.isEmpty())
                continue;

            String[] line = text.split("\\s+");
            int firstGene = 0;

            if (line[0].endsWith(":")) {
                // Locale.ROOT keeps the prefix stable whatever the default locale is.
                db = line[0].trim().toLowerCase(java.util.Locale.ROOT);
                firstGene = 1;
            }
            // NOTE(review): a line without an organism prefix is assumed to be the continuation
            // of the previous organism's gene list - confirm against the KEGG flat-file format.

            if (db == null)
                continue;

            Set<String> genes = res.computeIfAbsent(db, key -> new HashSet<>());

            for (int i = firstGene; i < line.length; i++)
                genes.add(line[i].split("\\(")[0].trim());
        }
        catch (Exception e) {
            logger.error("An error occurred while parsing a line of the GENES section", e);
        }
    }

    return res;
}
/**
 * Groups the genes to search into queries of at most {@code BATCH_SIZE} genes each.
 * <p>
 * Every gene is written as its map key immediately followed by the gene identifier, and the
 * genes of a query are separated by {@code '+'}. Genes of different keys may share a query.
 *
 * @param toSearch gene identifiers grouped by the prefix to put in front of each of them
 * @return the set of queries covering all genes
 */
public static Set<String> generateBatchesDistribution(Map<String, Set<String>> toSearch) {

    Set<String> batches = new HashSet<>();

    StringBuilder current = new StringBuilder();
    int count = 0;

    for (Map.Entry<String, Set<String>> group : toSearch.entrySet()) {

        String prefix = group.getKey();

        for (String gene : group.getValue()) {

            if (current.length() > 0)
                current.append("+");

            current.append(prefix + gene);
            count++;

            if (count == BATCH_SIZE) {
                batches.add(current.toString().replaceAll("\n", ""));
                current.setLength(0);
                count = 0;
            }
        }
    }

    // Last, incomplete batch.
    String remaining = current.toString();

    if (!remaining.isBlank())
        batches.add(remaining.replaceAll("\n", ""));

    return batches;
}
/**
 * Blocks until at least {@code DEFAULT_DELAY_MILLIS} milliseconds have passed since
 * {@code start}; returns immediately if that time has already elapsed.
 * <p>
 * If the thread is interrupted while waiting, the wait ends and the interrupt flag is restored
 * so that the caller's next blocking call can react to it.
 *
 * @param start instant at which the operation being throttled started
 */
public static void applyWait(Instant start) {

    long timeElapsed = Duration.between(start, Instant.now()).toMillis();

    if (timeElapsed >= DEFAULT_DELAY_MILLIS)
        return;

    try {
        TimeUnit.MILLISECONDS.sleep(DEFAULT_DELAY_MILLIS - timeElapsed);
    }
    catch (InterruptedException e) {
        // The signature cannot propagate the exception; keep the interruption visible instead.
        Thread.currentThread().interrupt();
        logger.warn("Interrupted while waiting between requests", e);
    }
}
/**
 * Writes the sequences to a new FASTA file in the KEGG FASTA directory and registers the path
 * of that file through {@code FilesUtils.saveLastKnownVersion}.
 * <p>
 * Each sequence is written as a {@code >identifier} line, the sequence line and an empty line.
 * Failures are logged and not propagated.
 *
 * @param sequences map of FASTA identifier to amino acid sequence
 */
private static void saveResults(Map<String, String> sequences) {

    try {
        String filePath = FilesUtils.getKeggFastaDirectory().concat(FilesUtils.generateFileName(KEGG_FASTA_NAME, ".faa"));

        FilesUtils.saveLastKnownVersion(PATH_LAST_KNOWN_VERSION, filePath);

        // try-with-resources closes (and flushes) the writer even when a write fails.
        try (BufferedWriter out = new BufferedWriter(new FileWriter(new File(filePath)))) {

            for (Map.Entry<String, String> seq : sequences.entrySet()) {
                out.write(">" + seq.getKey() + "\n");
                out.write(seq.getValue() + "\n\n");
            }
        }
    }
    catch (Exception e) {
        logger.error("An error occurred while saving the KEGG FASTA file", e);
    }
}
}
src/main/java/pt/uminho/ceb/biosystems/transyt/scraper/APIs/MetaCycAPI.java
View file @
c99b7898
...
...
@@ -30,7 +30,7 @@ public class MetaCycAPI {
public
static
final
int
LIMIT
=
5
;
public
static
final
int
ALL_SEARCH_LIMIT
=
2
;
public
static
final
int
DEFAULT_DELAY_MILLIS
=
10
00
;
public
static
final
int
DEFAULT_DELAY_MILLIS
=
8
00
;
public
static
final
int
BATCH_SIZE
=
250
;
protected
static
final
String
[]
DATABASES
=
new
String
[]
{
"ECOLI"
,
"META"
,
"BSUB"
,
"YEAST"
};
...
...
src/main/java/pt/uminho/ceb/biosystems/transyt/scraper/tcdb/comparison/Compare.java
View file @
c99b7898
...
...
@@ -116,7 +116,7 @@ public class Compare {
descriptionColumn
=
i
+
1
;
data
=
ReadExcelFile
.
getData
(
"C:\\Users\\Davide\\OneDrive - Universidade do Minho\\UMinho\\Tese\\Internal database\\results.xlsx"
);
data
=
ReadExcelFile
.
getData
(
"C:\\Users\\Davide\\OneDrive - Universidade do Minho\\UMinho\\Tese\\Internal database\\results.xlsx"
,
true
,
null
);
performComparison
(
prepareInfomation
());
...
...
src/main/java/pt/uminho/ceb/biosystems/transyt/scraper/tcdb/reactionsGenerator/GenerateTransportReactions.java
View file @
c99b7898
...
...
@@ -106,6 +106,8 @@ public class GenerateTransportReactions {
newTcContainer
.
setFamily
(
tcNumberContainer
.
getFamily
());
tcNumberContainer
.
filterReactionsNotBelongingToTransportType
(
evidence
);
boolean
metacycContainsMiddleCompartment
=
false
;
if
(
metaCycData
.
containsKey
(
accession
))
{
...
...
@@ -115,8 +117,19 @@ public class GenerateTransportReactions {
ReactionContainer
rContainer
=
metacycReactions
.
get
(
rKey
);
rContainer
.
setConfidenceLevel
(
METACYC_CONFIDENCE_LEVEL
);
rContainer
.
setOriginalReaction
(
rContainer
.
getReaction
());
TypeOfTransporter
type
=
FindTransporters
.
findTypeOfTransport2
(
rContainer
,
tcNumber
);
TypeOfTransporter
type
=
null
;
if
(
rContainer
.
getMetabolites
().
size
()
==
1
)
type
=
TypeOfTransporter
.
Uniport
;
else
type
=
FindTransporters
.
findTypeOfTransport2
(
rContainer
,
tcNumber
);
if
(
type
.
equals
(
TypeOfTransporter
.
BiochemicalCoA
))
{
String
[]
reactAux
=
correctMetaCycCoaReactions
(
rContainer
);
rContainer
.
setReactant
(
reactAux
[
0
]);
rContainer
.
setProduct
(
reactAux
[
1
]);
}
rContainer
.
setTransportType
(
type
);
...
...
@@ -128,12 +141,16 @@ public class GenerateTransportReactions {
if
(
evidence
!=
null
)
revEvidence
=
rContainer
.
isReversible
();
if
(!
type
.
equals
(
TypeOfTransporter
.
Default
))
if
(!
type
.
equals
(
TypeOfTransporter
.
Default
))
{
metacycContainsMiddleCompartment
=
rContainer
.
getReaction
().
contains
(
ReactionContainer
.
MIDDLE_COMPARTMENT_TOKEN
);
newTcContainer
.
addReaction
(
rContainer
);
}
}
}
boolean
skip
=
checkIfIgnoreTCDB
(
tcdbMetContainer
.
getMetabolites
(
tcNumber
),
newTcContainer
.
getAllReactionsIds
().
isEmpty
());
boolean
skip
=
checkIfIgnoreTCDB
(
metacycContainsMiddleCompartment
,
tcdbMetContainer
.
getMetabolites
(
tcNumber
),
newTcContainer
.
getAllReactionsIds
().
isEmpty
(),
tcNumberContainer
.
getTransportTypesAssociatedToProtein
());
if
(!
skip
&&
tcNumberContainer
.
getAllReactionsIds
().
size
()
==
0
)
{
...
...
@@ -276,6 +293,39 @@ public class GenerateTransportReactions {
}
/**
 * Assigns a compartment to every metabolite of a MetaCyc CoA reaction.
 * <p>
 * In the reactant side, {@code META:ATP} and {@code META:CO-A} receive the interior compartment
 * token and every other metabolite the exterior one. In the product side every metabolite
 * receives the interior token and {@code " + META:PROTON (in)"} is appended.
 * <p>
 * Each side is rebuilt metabolite by metabolite (joined with {@code " + "}) instead of using
 * {@code String.replace} on the whole side, which also tagged metabolites whose identifier
 * contains another metabolite's identifier and tagged repeated metabolites more than once.
 *
 * @param rContainer reaction whose reactant and product sides are read
 * @return array with the corrected reactant side at index 0 and product side at index 1
 */
private static String[] correctMetaCycCoaReactions(ReactionContainer rContainer) {

    StringBuilder reactant = new StringBuilder();

    for (String mAux : rContainer.getReactant().split("\\s+\\+\\s+")) {

        mAux = mAux.trim();

        boolean interior = mAux.equalsIgnoreCase("META:ATP") || mAux.equalsIgnoreCase("META:CO-A");

        if (reactant.length() > 0)
            reactant.append(" + ");

        reactant.append(mAux).append(" ");

        if (interior)
            reactant.append(ReactionContainer.INTERIOR_COMPARTMENT_TOKEN);
        else
            reactant.append(ReactionContainer.EXTERIOR_COMPARTMENT_TOKEN);
    }

    StringBuilder product = new StringBuilder();

    for (String mAux : rContainer.getProduct().split("\\s+\\+\\s+")) {

        if (product.length() > 0)
            product.append(" + ");

        product.append(mAux.trim()).append(" ").append(ReactionContainer.INTERIOR_COMPARTMENT_TOKEN);
    }

    String[] res = new String[2];

    res[0] = reactant.toString();
    res[1] = product + " + META:PROTON (in)";

    return res;
}
/**
* Method to assess if TCDB reactions should be ignored.
*
...
...
@@ -283,7 +333,13 @@ public class GenerateTransportReactions {
* @param currentContainerIsEmpty
* @return
*/
private
static
boolean
checkIfIgnoreTCDB
(
List
<
String
>
metabolites
,
boolean
currentContainerIsEmpty
)
{
private
static
boolean
checkIfIgnoreTCDB
(
boolean
containsMiddleCompartment
,
List
<
String
>
metabolites
,
boolean
currentContainerIsEmpty
,
Set
<
TypeOfTransporter
>
types
)
{
if
(
containsMiddleCompartment
)
return
true
;
if
(
types
.
contains
(
TypeOfTransporter
.
BiochemicalCoA
))
return
false
;
if
(!
currentContainerIsEmpty
)
{
...
...
@@ -544,7 +600,7 @@ public class GenerateTransportReactions {
ReactionContainer
container
=
newTcContainer
.
getReactionContainer
(
id
);
if
(!
container
.
get
TransportType
().
equals
(
toKeep
))
if
(!
container
.
get
ConfidenceLevel
().
equals
(
METACYC_CONFIDENCE_LEVEL
)
&&
!
container
.
getTransportType
().
equals
(
toKeep
)
&&
!
container
.
getTransportType
().
equals
(
TypeOfTransporter
.
Biochemical
))
newTcContainer
.
removeReaction
(
id
);
}
}
...
...
src/main/java/pt/uminho/ceb/biosystems/transyt/scraper/tcdb/tcdbTransportTypesRetriever/FindTransporters.java
View file @
c99b7898
...
...
@@ -422,8 +422,11 @@ public class FindTransporters {
String
reactant
=
reaction
.
getReactant
();
String
product
=
reaction
.
getProduct
();
if
(
reaction
.
getReaction
().
contains
(
ReactionContainer
.
MIDDLE_COMPARTMENT_TOKEN
))
return
TypeOfTransporter
.
Biochemical
;
if
(
tc
.
matches
(
"4\\.A\\..+"
))
else
if
(
tc
.
matches
(
"4\\.A\\..+"
))
return
TypeOfTransporter
.
PEPdependent
;
else
if
(
reactant
.
contains
(
":ATP"
)
&&
reactant
.
contains
(
":CO-A"
))
...
...
src/main/java/pt/uminho/ceb/biosystems/transyt/scraper/tcdb/tcdbTransportTypesRetriever/Retriever.java
View file @
c99b7898
...
...
@@ -4,10 +4,13 @@ import java.util.HashMap;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Set
;
import
java.util.concurrent.TimeUnit
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
pt.uminho.ceb.biosystems.transyt.scraper.APIs.KeggAPI
;
import
pt.uminho.ceb.biosystems.transyt.scraper.APIs.MetaCycAPI
;
import
pt.uminho.ceb.biosystems.transyt.scraper.tcdb.reactionsGenerator.GenerateTransportReactions
;
import
pt.uminho.ceb.biosystems.transyt.scraper.tcdb.utilities.ProcessTcdbMetabolitesExcel
;
import
pt.uminho.ceb.biosystems.transyt.utilities.capsules.ReactionContainer
;
...
...
@@ -20,180 +23,218 @@ import pt.uminho.ceb.biosystems.transyt.utilities.files.FilesUtils;
import
pt.uminho.ceb.biosystems.transyt.utilities.files.JSONFilesUtils
;
public
class
Retriever
{
private
static
final
Logger
logger
=
LoggerFactory
.
getLogger
(
Retriever
.
class
);
public
static
void
runRetriever
(
boolean
useCache
,
boolean
tests
,
String
accTest
)
throws
Exception
{
try
{
///////// - SEARCHER
logger
.
info
(
"Retrieving TCDB FASTA file..."
);
if
(!
useCache
)
TcdbRetriever
.
getSubstrates
();
ReadFastaTcdb
.
buildFastaFileForAlignments
();
Set
<
String
>
tcNumbers
=
TcdbExplorer
.
getTcNumbers
(
true
);