Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
memoriav
Memobase 2020
services
Import Process
XML Data Transform
Commits
52b1748a
Commit
52b1748a
authored
Oct 13, 2020
by
Jonas Waeber
Browse files
Reworked implementation
parent
55bbe8b4
Changes
23
Hide whitespace changes
Inline
Side-by-side
build.gradle
View file @
52b1748a
...
@@ -2,12 +2,11 @@ plugins {
...
@@ -2,12 +2,11 @@ plugins {
id
'application'
id
'application'
id
'distribution'
id
'distribution'
id
'org.jetbrains.kotlin.jvm'
version
'1.3.71'
id
'org.jetbrains.kotlin.jvm'
version
'1.3.71'
id
'
com.
palantir
.git
-version'
version
'0.11.0'
id
"
com.
gitlab.morality
.g
r
it
"
version
"2.0.2"
id
'org.jlleitschuh.gradle.ktlint'
version
'9.2.1'
id
'org.jlleitschuh.gradle.ktlint'
version
'9.2.1'
}
}
group
'org.memobase'
group
'org.memobase'
version
=
gitVersion
()
mainClassName
=
'org.memobase.App'
mainClassName
=
'org.memobase.App'
jar
{
jar
{
...
@@ -38,8 +37,8 @@ dependencies {
...
@@ -38,8 +37,8 @@ dependencies {
implementation
"org.apache.logging.log4j:log4j-slf4j-impl:${log4jV}"
implementation
"org.apache.logging.log4j:log4j-slf4j-impl:${log4jV}"
implementation
"org.apache.kafka:kafka-streams:${kafkaV}"
implementation
"org.apache.kafka:kafka-streams:${kafkaV}"
implementation
'ch.memobase:memobase-kafka-utils:0.1.2'
implementation
'org.memobase:memobase-service-utilities:0.1
4.2
'
implementation
'org.memobase:memobase-service-utilities:0.1
6.0
'
// SFTP Client
// SFTP Client
// is needed because of a bug.
// is needed because of a bug.
implementation
'com.hierynomus:sshj:0.27.0'
implementation
'com.hierynomus:sshj:0.27.0'
...
...
helm-charts/values.yaml
View file @
52b1748a
...
@@ -8,8 +8,6 @@ image: "memoriav/memobase-2020/services/import-process/xml-data-transform"
...
@@ -8,8 +8,6 @@ image: "memoriav/memobase-2020/services/import-process/xml-data-transform"
tag
:
"
latest"
tag
:
"
latest"
deploymentName
:
xml-data-transform
deploymentName
:
xml-data-transform
## TODO: This needs to be solved differently. This way it is not possible to deploy a replica-set.
## somehow the id needs to be dependent on the pod name?
applicationId
:
xml-data-transform-app
applicationId
:
xml-data-transform-app
kafkaConfigs
:
prod-kafka-bootstrap-servers
kafkaConfigs
:
prod-kafka-bootstrap-servers
...
...
src/main/kotlin/HeaderExtraction.kt
deleted
100644 → 0
View file @
55bbe8b4
/*
* xml-data-transform
* Copyright (C) 2020 Memoriav
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package
org.memobase
import
org.apache.kafka.common.header.Headers
import
org.apache.kafka.streams.kstream.ValueTransformer
import
org.apache.kafka.streams.kstream.ValueTransformerSupplier
import
org.apache.kafka.streams.processor.ProcessorContext
/**
 * Builds the [TransformMetadata] for a [Message] from the headers of the Kafka record that carries it.
 *
 * The record tag is read from the `xmlRecordTag` header and the identifier field name from the
 * `xmlIdentifierFieldName` header.
 *
 * Headers are looked up in [transform], i.e. once per record. [init] runs once per transformer
 * instance rather than per record, so values captured there cannot belong to the record that is
 * later handed to [transform].
 */
class HeaderExtraction : ValueTransformer<Message, TransformMetadata> {
    // Kept from init() so transform() can reach the headers of the record being processed.
    private var processorContext: ProcessorContext? = null

    /**
     * Stores the processor context for later header lookups.
     *
     * @param context the context supplied by Kafka Streams; may be null.
     */
    override fun init(context: ProcessorContext?) {
        processorContext = context
    }

    /**
     * Returns the text of the first header named [tag].
     *
     * The header bytes are decoded explicitly: calling `toString()` on a `ByteArray` returns the
     * JVM array identity string (e.g. `[B@1b2c3d`), not the content.
     * NOTE(review): assumes the producer writes header values as UTF-8 - confirm.
     *
     * @throws NoSuchElementException if the header is absent or carries no value.
     */
    private fun extract(tag: String, headers: Headers): String {
        val rawValue = headers.headers(tag).firstOrNull()?.value()
            ?: throw NoSuchElementException("Required header '$tag' is missing or has no value.")
        return String(rawValue, Charsets.UTF_8)
    }

    /**
     * Combines the path of [value] with the record tag and identifier tag taken from the
     * headers of the current record.
     *
     * @throws IllegalStateException if [init] was never called with a non-null context.
     * @throws NoSuchElementException if one of the required headers is missing.
     */
    override fun transform(value: Message): TransformMetadata {
        val context = checkNotNull(processorContext) {
            "HeaderExtraction was not initialised with a processor context."
        }
        val headers = context.headers()
        return TransformMetadata(
            value.path,
            extract("xmlRecordTag", headers),
            extract("xmlIdentifierFieldName", headers)
        )
    }

    /** Nothing to release. */
    override fun close() {
    }
}
/**
 * Supplier handed to `transformValues`; every call to [get] returns a new [HeaderExtraction].
 */
class HeaderExtractionSupplier : ValueTransformerSupplier<Message, TransformMetadata> {
    /** @return a newly constructed [HeaderExtraction]. */
    override fun get(): ValueTransformer<Message, TransformMetadata> = HeaderExtraction()
}
\ No newline at end of file
src/main/kotlin/KafkaTopology.kt
View file @
52b1748a
...
@@ -18,78 +18,112 @@
...
@@ -18,78 +18,112 @@
package
org.memobase
package
org.memobase
import
ch.memobase.kafka.utils.ConfigJoiner
import
ch.memobase.kafka.utils.models.ImportService
import
com.beust.klaxon.Klaxon
import
com.beust.klaxon.Klaxon
import
net.schmizz.sshj.sftp.RemoteFile
import
org.apache.kafka.common.serialization.Serdes
import
org.apache.kafka.streams.KeyValue
import
org.apache.kafka.streams.KeyValue
import
org.apache.kafka.streams.StreamsBuilder
import
org.apache.kafka.streams.StreamsBuilder
import
org.apache.kafka.streams.Topology
import
org.apache.kafka.streams.Topology
import
org.apache.kafka.streams.kstream.KStream
import
org.apache.kafka.streams.kstream.KStream
import
org.apache.
kafka.streams.kstream.Predicate
import
org.apache.
logging.log4j.LogManager
import
org.
apache.kafka.streams.kstream.ValueTransformerSupplier
import
org.
memobase.models.*
import
org.memobase.settings.SettingsLoader
import
org.memobase.settings.SettingsLoader
import
org.memobase.sftp.SftpClient
import
org.memobase.sftp.SftpClient
import
org.memobase.utils.MissingIdentifierException
import
org.memobase.utils.SerdeMessage
import
org.memobase.utils.XsltException
import
org.memobase.xml.XMLTransformer
import
settings.HeaderExtractionTransformSupplier
import
java.io.File
import
java.io.File
import
java.io.InputStream
import
java.io.StringReader
import
java.io.StringReader
class
KafkaTopology
(
private
val
settings
:
SettingsLoader
)
{
class
KafkaTopology
(
private
val
settings
:
SettingsLoader
)
{
private
val
log
=
LogManager
.
getLogger
(
"XMLDataImport"
)
private
val
sftpClient
:
SftpClient
=
SftpClient
(
settings
.
sftpSettings
)
private
val
sftpClient
:
SftpClient
=
SftpClient
(
settings
.
sftpSettings
)
private
val
xmlTransformer
=
XMLTransformer
(
settings
.
appSettings
)
private
val
xmlTransformer
=
XMLTransformer
()
private
val
reportingTopic
=
settings
.
outputTopic
+
"-reporting"
private
val
configJoiner
=
ConfigJoiner
<
Message
,
ByteArray
>(
ImportService
.
Transform
,
SerdeMessage
(),
Serdes
.
ByteArray
(),
xmlTransformer
::
xsltFunction
)
private
val
reportingTopic
=
settings
.
processReportTopic
fun
build
():
Topology
{
fun
build
():
Topology
{
val
builder
=
StreamsBuilder
()
val
builder
=
StreamsBuilder
()
val
stream
=
builder
val
configStream
=
builder
.
stream
<
ByteArray
,
ByteArray
>(
"import-process-config"
)
val
dataStream
=
builder
.
stream
<
String
,
String
>(
settings
.
inputTopic
)
.
stream
<
String
,
String
>(
settings
.
inputTopic
)
.
flatMapValues
{
_
,
value
->
parseMessage
(
value
)
}
.
flatMapValues
{
_
,
value
->
parseMessage
(
value
)
}
.
filter
{
_
,
value
->
value
.
format
!=
Formats
.
xml
}
.
filter
{
_
,
value
->
value
.
format
!=
Formats
.
xml
}
.
transformValues
(
HeaderExtractionSupplier
())
.
mapValues
{
value
->
Pair
(
value
,
sftpClient
.
open
(
File
(
value
.
path
)))
}
val
joinedStream
=
configJoiner
.
join
(
dataStream
,
configStream
)
.
map
{
key
,
value
->
value
.
second
.
use
{
val
saxHandlerStream
=
joinedStream
xmlTransformer
.
applyXSLT
(
key
,
value
.
first
,
it
)
.
mapValues
{
value
->
Input
(
value
.
left
,
value
.
right
)
}
}
.
transformValues
(
HeaderExtractionTransformSupplier
<
Input
>())
.
mapValues
{
value
->
Content
(
value
.
first
.
message
,
value
.
second
,
value
.
first
.
xsltData
,
sftpClient
.
open
(
File
(
value
.
first
.
message
.
path
))
)
}
}
.
map
{
key
,
value
->
value
.
inputStream
.
use
{
try
{
val
output
=
xmlTransformer
.
applyXSLT
(
key
,
value
.
headerMetadata
,
value
.
inputStream
,
value
.
xsltData
)
KeyValue
(
output
.
first
,
Pair
(
output
.
second
.
output
.
toString
(),
output
.
second
.
getReport
())
)
}
catch
(
ex
:
XsltException
)
{
log
.
error
(
ex
.
message
)
KeyValue
(
key
,
Pair
(
null
,
Report
(
key
,
ReportStatus
.
failure
,
ex
.
localizedMessage
)
)
)
streamOutput
(
stream
)
}
catch
(
ex
:
MissingIdentifierException
)
{
log
.
error
(
ex
.
message
)
KeyValue
(
key
,
Pair
(
null
,
Report
(
key
,
ReportStatus
.
failure
,
ex
.
localizedMessage
)
)
)
}
}
}
streamOutput
(
saxHandlerStream
)
return
builder
.
build
()
return
builder
.
build
()
}
}
private
fun
streamOutput
(
stream
:
KStream
<
String
,
SAXContentHandler
>)
{
private
fun
streamOutput
(
stream
:
KStream
<
String
,
Pair
<
String
?
,
Report
>
>)
{
stream
stream
.
mapValues
{
value
->
value
.
output
.
toString
()
}
.
filter
{
_
,
value
->
value
!=
null
}
.
mapValues
{
value
->
value
.
first
}
.
to
(
settings
.
outputTopic
)
.
to
(
settings
.
outputTopic
)
stream
stream
.
mapValues
{
value
->
value
.
getReport
()
.
toJson
()
}
.
mapValues
{
value
->
value
.
second
.
toJson
()
}
.
to
(
reportingTopic
)
.
to
(
reportingTopic
)
stream
.
mapValues
{
value
->
val
report
=
value
.
getReport
()
if
(
report
.
status
==
ReportStatus
.
success
)
{
ProcessReport
(
"xml-data-transform"
,
ReportStatus
.
success
,
1
,
1
,
0
)
}
else
{
ProcessReport
(
"xml-data-transform"
,
ReportStatus
.
failure
,
1
,
0
,
1
)
}
}
.
mapValues
{
value
->
value
.
toJson
()
}
.
to
(
settings
.
processReportTopic
)
}
}
private
fun
parseMessage
(
value
:
String
):
List
<
Message
>
{
private
fun
parseMessage
(
value
:
String
):
List
<
Message
>
{
...
...
src/main/kotlin/ProcessReport.kt
deleted
100644 → 0
View file @
55bbe8b4
/*
* xml-data-transform
* Copyright (C) 2020 Memoriav
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package
org.memobase
import
com.beust.klaxon.Klaxon
/**
 * Summary of a processing step.
 *
 * @property id identifier written into the report.
 * @property status status string of the report.
 * @property total total count.
 * @property successes count of successes.
 * @property failures count of failures.
 */
data class ProcessReport(
    val id: String,
    val status: String,
    val total: Int,
    val successes: Int,
    val failures: Int
) {
    /** @return this report serialised to a JSON string by Klaxon. */
    fun toJson(): String = Klaxon().toJsonString(this)
}
src/main/kotlin/Values.kt
deleted
100644 → 0
View file @
55bbe8b4
/*
* XML Data Transform Service
* Copyright (C) 2020 Memoriav
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package
org.memobase
import
com.beust.klaxon.json
/** String constants naming the formats a message can carry. */
object Formats {
    const val invalid = "INVALID"
    const val xml = "XML"
    const val error = "ERROR"
}
/** File extension constants (without the leading dot). */
object Extensions {
    const val csv = "csv"
    const val tsv = "tsv"
    const val xlsx = "xlsx"
    const val xls = "xls"
    const val ods = "ods"
}
/** Status strings used in reports. */
object ReportStatus {
    const val success = "SUCCESS"
    const val failure = "FAILURE"
}
/** Pre-built JSON object of the form `{"message": "ERROR"}`. */
object ErrorResult {
    val result = json {
        obj("message" to Formats.error)
    }
}
/** Factory functions for the human-readable messages placed in reports. */
object ReportMessages {
    /** Message for a file that could not be processed; [message] states the reason. */
    fun processFailure(fileName: String, message: String): String =
        "Could not process file $fileName, because $message"

    /** Message stating that [count] records were produced. */
    fun processSuccess(count: Int): String =
        "Transformed table data into $count records."

    /** Message for an invalid input file; [message] states what is wrong with [fileName]. */
    fun invalidFile(fileName: String, message: String): String =
        "Invalid Input Error: $message for file $fileName."

    /** Message for a successfully transformed row number [count] with the given [identifier]. */
    fun reportSuccess(identifier: String, count: Int): String =
        "Successfully transformed row $count into key-value map with identifier $identifier."

    /** Message for invalid input described by [message]. */
    fun reportFailure(message: String): String =
        "Invalid Input Error: $message"
}
src/main/kotlin/models/Content.kt
0 → 100644
View file @
52b1748a
package
org.memobase.models
import
net.sf.saxon.s9api.XsltExecutable
import
settings.HeaderMetadata
import
java.io.InputStream
/**
 * Bundle of a message, its header metadata, XSLT bytes and an input stream.
 *
 * Because [xsltData] is an array and [inputStream] is a stream, the generated `equals` and
 * `hashCode` compare those two properties by reference, not by content.
 *
 * @property message the incoming message.
 * @property headerMetadata metadata taken from the record headers.
 * @property xsltData raw bytes of the XSLT.
 * @property inputStream stream associated with this content.
 *   NOTE(review): ownership and closing of this stream are not visible here - confirm at the call site.
 */
data class Content(
    val message: Message,
    val headerMetadata: HeaderMetadata,
    val xsltData: ByteArray,
    val inputStream: InputStream
)
\ No newline at end of file
src/main/kotlin/
TransformMetad
at
a
.kt
→
src/main/kotlin/
models/Form
at
s
.kt
View file @
52b1748a
/*
/*
*
xml-d
ata
-t
ransform
*
XML D
ata
T
ransform
Service
* Copyright (C) 2020 Memoriav
* Copyright (C) 2020 Memoriav
*
*
* This program is free software: you can redistribute it and/or modify
* This program is free software: you can redistribute it and/or modify
...
@@ -16,10 +16,8 @@
...
@@ -16,10 +16,8 @@
* along with this program. If not, see <https://www.gnu.org/licenses/>.
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
*/
package
org.memobase
package
org.memobase
.models
data class
TransformMetadata
(
object
Formats
{
val
path
:
String
,
const
val
xml
=
"XML"
val
recordTag
:
String
,
}
val
identifierTag
:
String
)
\ No newline at end of file
src/main/kotlin/models/Input.kt
0 → 100644
View file @
52b1748a
package
org.memobase.models
/**
 * Pairs an incoming [Message] with the raw bytes of the XSLT.
 *
 * `equals` and `hashCode` are overridden because the generated versions would compare
 * [xsltData] by array reference; here two inputs are equal when their messages are equal and
 * their XSLT bytes have the same content.
 *
 * @property message the incoming message.
 * @property xsltData raw bytes of the XSLT.
 */
data class Input(
    val message: Message,
    val xsltData: ByteArray
) {
    override fun equals(other: Any?): Boolean {
        if (this === other) return true
        if (other !is Input) return false
        return message == other.message && xsltData.contentEquals(other.xsltData)
    }

    // Must agree with equals(): hash the array content, not its identity.
    override fun hashCode(): Int = 31 * message.hashCode() + xsltData.contentHashCode()
}
src/main/kotlin/Message.kt
→
src/main/kotlin/
models/
Message.kt
View file @
52b1748a
...
@@ -16,7 +16,7 @@
...
@@ -16,7 +16,7 @@
* along with this program. If not, see <https://www.gnu.org/licenses/>.
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
*/
package
org.memobase
package
org.memobase
.models
data class
Message
(
data class
Message
(
val
path
:
String
,
val
path
:
String
,
...
...
src/main/kotlin/Report.kt
→
src/main/kotlin/
models/
Report.kt
View file @
52b1748a
...
@@ -16,7 +16,7 @@
...
@@ -16,7 +16,7 @@
* along with this program. If not, see <https://www.gnu.org/licenses/>.
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
*/
package
org.memobase
package
org.memobase
.models
import
com.beust.klaxon.Klaxon
import
com.beust.klaxon.Klaxon
...
...
src/main/kotlin/models/ReportStatus.kt
0 → 100644
View file @
52b1748a
package
org.memobase.models
/** Status strings used in reports. */
object ReportStatus {
    const val success = "SUCCESS"
    const val failure = "FAILURE"
}
\ No newline at end of file
src/main/kotlin/utils/MissingIdentifierException.kt
0 → 100644
View file @
52b1748a
package
org.memobase.utils
/**
 * Signals that no identifier could be extracted from a resource.
 *
 * @param key key of the resource, included in the exception message.
 * @param field name of the field that was searched for the identifier.
 */
class MissingIdentifierException(
    key: String,
    field: String
) : Exception("Could not extract an identifier from resource $key in field $field.")
src/main/kotlin/utils/SerdeMessage.kt
0 → 100644
View file @
52b1748a
package
org.memobase.utils
import
com.beust.klaxon.Klaxon
import
org.apache.kafka.common.serialization.Deserializer
import
org.apache.kafka.common.serialization.Serde
import
org.apache.kafka.common.serialization.Serializer
import
org.memobase.models.Message
import
java.io.ByteArrayInputStream
import
java.nio.charset.Charset
/**
 * Kafka [Serde] for [Message] that stores messages as UTF-8 encoded JSON, using Klaxon.
 */
class SerdeMessage : Serde<Message> {
    private val klaxon = Klaxon()

    /** @return a serializer that writes the message as a UTF-8 JSON string. */
    override fun serializer(): Serializer<Message> =
        Serializer { _, data -> klaxon.toJsonString(data).toByteArray(Charsets.UTF_8) }

    /**
     * @return a deserializer that parses the payload as JSON into a [Message]; it throws
     * [IllegalStateException] when Klaxon yields no message.
     */
    override fun deserializer(): Deserializer<Message> =
        Deserializer { _, data ->
            klaxon.parse<Message>(ByteArrayInputStream(data))
                // Decode with UTF-8, matching the serializer, so the quoted payload stays
                // readable whatever the platform default charset is.
                ?: error("Could not deserialize message: ${data.toString(Charsets.UTF_8)}")
        }
}
\ No newline at end of file
src/main/kotlin/utils/XsltException.kt
0 → 100644
View file @
52b1748a
package
org.memobase.utils
import
net.sf.saxon.s9api.StaticError
/**
 * Raised when parsing an XSLT produced errors.
 *
 * @param errorList the errors found; their messages are joined with "; " into the exception message.
 */
class XsltException(errorList: List<StaticError>) : Exception(
    "Found error while parsing XSLT: ${errorList.joinToString(separator = "; ") { it.message }}"
)
\ No newline at end of file
src/main/kotlin/SAXContentHandler.kt
→
src/main/kotlin/
xml/
SAXContentHandler.kt
View file @
52b1748a
...
@@ -16,24 +16,46 @@
...
@@ -16,24 +16,46 @@
* along with this program. If not, see <https://www.gnu.org/licenses/>.
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
*/
package
org.memobase
package
org.memobase
.xml
import
com.beust.klaxon.JsonObject
import
com.beust.klaxon.JsonObject
import
org.apache.logging.log4j.LogManager
import
org.apache.logging.log4j.LogManager
import
org.memobase.models.Report
import
org.memobase.models.ReportStatus
import
org.xml.sax.Attributes
import
org.xml.sax.Attributes
import
org.xml.sax.ContentHandler
import
org.xml.sax.ContentHandler
import
org.xml.sax.Locator
import
org.xml.sax.Locator
import
java.io.StringWriter
import
java.io.StringWriter
/**
* Transforms an XML stream into a JSON representation. Expects flat XML, preprocessed with
* an XSLT if necessary.
*
* Can only handle elements up to one level deep and ignores attributes.
*
* @param key The key of the kafka message.
* @param identifierFieldName The field name of the unique identifier of this record.
* @param recordTag The root tag of the xml structure.
*/
class
SAXContentHandler
(
key
:
String
,
private
val
identifierFieldName
:
String
,
private
val
recordTag
:
String
)
:
class
SAXContentHandler
(
key
:
String
,
private
val
identifierFieldName
:
String
,
private
val
recordTag
:
String
)
:
ContentHandler
{
ContentHandler
{
private
val
log
=
LogManager
.
getLogger
(
"SAXHandler"
)
private
val
log
=
LogManager
.
getLogger
(
"SAXHandler"
)
/**
* The json representation of the xml stream after processing.
*/
val
output
=
StringWriter
()
val
output
=
StringWriter
()
/**
* The identifier is used as a message key for the outgoing message.
*/
var
identifier
:
String
=
key
var
identifier
:
String
=
key
private
var
report
:
Report
?
=
null
private
var
report
:
Report
?
=
null
private
val
jsonResult
=
JsonObject
()
private
val
jsonResult
=
JsonObject
()
/**
* @return A report on the status of the transformation.
*/
fun
getReport
():
Report
{
fun
getReport
():
Report
{
return
report
.
let
{
return
report
.
let
{
it
it
...
...
src/main/kotlin/XMLTransformer.kt
→
src/main/kotlin/
xml/
XMLTransformer.kt
View file @
52b1748a
...
@@ -15,7 +15,7 @@
...
@@ -15,7 +15,7 @@
* You should have received a copy of the GNU Affero General Public License
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
*/
package
org.memobase
package
org.memobase
.xml