Commit 7223ee39 authored by Jonas Waeber's avatar Jonas Waeber

First implementation for xml data transform

parent 5fa97705
......@@ -32,31 +32,24 @@ ext {
}
dependencies {
// https://mvnrepository.com/artifact/org.elasticsearch.client/elasticsearch-rest-high-level-client
//compile group: 'org.elasticsearch.client', name: 'elasticsearch-rest-high-level-client', version: '7.1.0'
// Logging Framework
implementation "org.apache.logging.log4j:log4j-api:${log4jV}"
implementation "org.apache.logging.log4j:log4j-core:${log4jV}"
implementation "org.apache.logging.log4j:log4j-slf4j-impl:${log4jV}"
// Kafka Imports
implementation group: 'org.apache.kafka', name: 'kafka-clients', version: kafkaV
implementation "org.apache.kafka:kafka-streams:${kafkaV}"
implementation 'org.memobase:memobase-service-utilities:1.4.0'
// CSV Reader
implementation("com.github.doyaaaaaken:kotlin-csv-jvm:0.7.3")
// XSLX / XSL Reader
implementation 'org.apache.poi:poi:4.1.2'
implementation 'org.apache.poi:poi-ooxml:4.1.2'
// ODS Reader
implementation 'org.odftoolkit:odftoolkit:1.0.0-BETA1'
implementation 'org.memobase:memobase-service-utilities:1.7.1'
// https://mvnrepository.com/artifact/net.sf.saxon/Saxon-HE
// Declared with 'implementation' like every other dependency in this block;
// the legacy 'compile' configuration is deprecated in Gradle.
implementation group: 'net.sf.saxon', name: 'Saxon-HE', version: '9.9.1-7'
// used by saxon library
// https://mvnrepository.com/artifact/com.ibm.icu/icu4j
implementation group: 'com.ibm.icu', name: 'icu4j', version: '67.1'
// JSON Parser
implementation 'com.beust:klaxon:5.2'
// Compression
//implementation "org.apache.commons:commons-compress:1.19"
implementation 'org.jetbrains.kotlin:kotlin-stdlib-jdk8'
implementation "org.jetbrains.kotlin:kotlin-script-runtime:1.3.71"
......
/*
* Table Data Import Service
* XML Data Import Service
* Copyright (C) 2020 Memoriav
*
* This program is free software: you can redistribute it and/or modify
......@@ -24,37 +24,16 @@ import org.apache.kafka.streams.StreamsBuilder
import org.apache.kafka.streams.Topology
import org.apache.kafka.streams.kstream.KStream
import org.apache.kafka.streams.kstream.Predicate
import org.apache.logging.log4j.LogManager
import org.memobase.settings.SettingsLoader
import org.memobase.sftp.SftpClient
import java.io.File
import java.io.FileInputStream
import java.io.InputStream
import java.io.StringReader
import javax.xml.transform.TransformerFactory
import javax.xml.transform.sax.SAXResult
import javax.xml.transform.stream.StreamSource
import kotlin.system.exitProcess
class KafkaTopology(private val settings: SettingsLoader) {
private val log = LogManager.getLogger("KafkaTopologySetup")
private val sftpClient: SftpClient = SftpClient(settings.sftpSettings)
private val xlstFilePath = settings.appSettings.getProperty("xsltFilePath")
private val identifierFieldName = settings.appSettings.getProperty("identifierFieldName")
private val recordTag = settings.appSettings.getProperty("recordTag")
init {
if (identifierFieldName == "placeholderValue") {
log.error("Requires a value for identifier field name, but found default value.")
exitProcess(1)
}
}
private val factory = TransformerFactory.newInstance()
private val xslt = StreamSource(FileInputStream(File(xlstFilePath)))
private val transformer = factory.newTransformer(xslt)
private val xmlTransformer = XMLTransformer(settings.appSettings)
private val reportingTopic = settings.outputTopic + "-reporting"
fun build(): Topology {
......@@ -131,11 +110,7 @@ class KafkaTopology(private val settings: SettingsLoader) {
}
/**
 * Applies the configured XSLT to [data] and returns the transformed record.
 *
 * The work is delegated to [XMLTransformer.applyXSLT], which closes [data],
 * keys the result by the identifier found in the record and throws when that
 * identifier is empty.
 *
 * @param key the key of the incoming message, used as the initial identifier.
 * @param data the XML document to transform.
 * @return the record identifier paired with the populated content handler.
 */
private fun transformXml(key: String, data: InputStream): KeyValue<String, SAXContentHandler> {
    // Single code path: the former inline javax.xml transformer call made this
    // delegation unreachable.
    return xmlTransformer.applyXSLT(key, data)
}
......
......@@ -19,12 +19,15 @@
package org.memobase
import com.beust.klaxon.JsonObject
import org.apache.logging.log4j.LogManager
import org.xml.sax.Attributes
import org.xml.sax.ContentHandler
import org.xml.sax.Locator
import java.io.StringWriter
class SAXContentHandler(key: String, private val identifierFieldName: String, private val recordTag: String) : ContentHandler {
private val log = LogManager.getLogger("SAXHandler")
val output = StringWriter()
var identifier: String = key
var report: Report? = null
......@@ -70,10 +73,19 @@ class SAXContentHandler(key: String, private val identifierFieldName: String, pr
/**
 * SAX callback for character data inside an element.
 *
 * Appends the trimmed text to the inner element currently being read, or to
 * the outer element when no inner element is open. Text that arrives while no
 * element is tracked is logged as an error. Blank chunks are ignored.
 *
 * @param characters the parser's character buffer; may be null.
 * @param start index of the first relevant character in [characters].
 * @param size number of relevant characters, counted from [start].
 */
override fun characters(characters: CharArray?, start: Int, size: Int) {
    if (characters == null) return

    // Only the slice [start, start + size) belongs to this event; the rest of
    // the buffer may hold unrelated data, so the whole array must not be used.
    val line = String(characters, start, size).trim()
    if (line.isEmpty()) return

    // NOTE(review): each chunk is trimmed on its own; if the parser delivers one
    // text node in several calls, whitespace at the chunk boundary is dropped.
    when {
        currentInnerElementTag != "" -> currentInnerElementContent += line
        currentElementTag != "" -> currentElementContent += line
        else -> log.error(line)
    }
}
......@@ -109,14 +121,19 @@ class SAXContentHandler(key: String, private val identifierFieldName: String, pr
identifier = currentElementContent
}
if (innerElements.isEmpty()) {
jsonResult[currentElementTag] = currentElementContent
currentElementContent = ""
if (currentElementContent.isNotEmpty()) {
jsonResult[currentElementTag] = currentElementContent
}
} else {
jsonResult[currentElementTag] = innerElements.toMap()
innerElements.clear()
}
currentElementTag = ""
currentElementContent = ""
} else if (currentInnerElementTag == localName) {
innerElements.add(Pair(currentInnerElementTag, currentInnerElementContent))
if (currentInnerElementContent.isNotEmpty()) {
innerElements.add(Pair(currentInnerElementTag, currentInnerElementContent))
}
currentInnerElementTag = ""
currentInnerElementContent = ""
} else {
......
/*
* text-file-validation
* XML Data Transform Service
* Copyright (C) 2020 Memoriav
*
* This program is free software: you can redistribute it and/or modify
......
/*
* xml-data-transform
* Copyright (C) 2020 Memoriav
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package org.memobase
import net.sf.saxon.s9api.Processor
import net.sf.saxon.s9api.SAXDestination
import net.sf.saxon.s9api.StaticError
import net.sf.saxon.s9api.XsltExecutable
import org.apache.kafka.streams.KeyValue
import org.apache.logging.log4j.LogManager
import java.io.File
import java.io.FileInputStream
import java.io.InputStream
import java.util.Properties
import javax.xml.transform.stream.StreamSource
/**
 * Compiles an XSLT stylesheet once and applies it to XML input streams,
 * feeding the result into a [SAXContentHandler].
 *
 * Settings read from the supplied properties:
 * - `xsltFilePath`: path of the stylesheet file.
 * - `identifierFieldName`: name of the field holding the record identifier.
 * - `recordTag`: passed through to [SAXContentHandler].
 *
 * NOTE(review): one transformer instance is reused for every call to
 * [applyXSLT]; confirm that callers never invoke it concurrently.
 *
 * @param appSettings application properties containing the keys listed above.
 */
class XMLTransformer(appSettings: Properties) {
    private val log = LogManager.getLogger("XMLTransformer")
    private val xsltFilePath = appSettings.getProperty("xsltFilePath")
    private val identifierFieldName = appSettings.getProperty("identifierFieldName")
    private val recordTag = appSettings.getProperty("recordTag")

    init {
        // Only reported, not fatal: construction continues with the placeholder value.
        if (identifierFieldName == "placeholderValue") {
            log.error("Requires a value for identifier field name, but found default value.")
        }
    }

    private val processor = Processor(false)
    private val xslt = compileXslt()
    private val transformer = xslt.load()

    /**
     * Compiles the stylesheet at [xsltFilePath].
     *
     * @return the compiled stylesheet.
     * @throws Exception when the compiler reported static errors.
     */
    private fun compileXslt(): XsltExecutable {
        val errorList = mutableListOf<StaticError>()
        val xsltCompiler = processor.newXsltCompiler()
        xsltCompiler.setErrorList(errorList)
        // Close the stylesheet file as soon as compilation is done instead of
        // leaking the file handle for the lifetime of the process.
        val executable = FileInputStream(File(xsltFilePath)).use { stylesheet ->
            xsltCompiler.compile(StreamSource(stylesheet))
        }
        if (errorList.isNotEmpty()) {
            throw Exception(errorList.joinToString())
        }
        return executable
    }

    /**
     * Transforms [data] with the compiled stylesheet.
     *
     * [data] is closed once the transformation has run.
     *
     * @param key the key of the incoming record, used as the initial identifier.
     * @param data the XML document to transform.
     * @return the identifier collected by the content handler paired with the handler itself.
     * @throws Exception when the content handler ends up with an empty identifier.
     */
    fun applyXSLT(key: String, data: InputStream): KeyValue<String, SAXContentHandler> {
        val contentHandler = SAXContentHandler(key, identifierFieldName, recordTag)
        data.use {
            transformer.setSource(StreamSource(it))
            transformer.destination = SAXDestination(contentHandler)
            transformer.transform()
        }
        if (contentHandler.identifier.isEmpty()) {
            throw Exception("No valid identifier found in record $key in field $identifierFieldName.")
        }
        return KeyValue(contentHandler.identifier, contentHandler)
    }
}
\ No newline at end of file
......@@ -29,10 +29,13 @@ import org.apache.kafka.streams.TopologyTestDriver
import org.apache.kafka.streams.test.ConsumerRecordFactory
import org.apache.logging.log4j.LogManager
import org.assertj.core.api.Assertions.assertThat
import org.junit.jupiter.api.Test
import org.junit.jupiter.api.TestInstance
import org.junit.jupiter.params.ParameterizedTest
import org.junit.jupiter.params.provider.MethodSource
import org.memobase.testing.EmbeddedSftpServer
import java.io.InputStream
import java.util.Properties
@TestInstance(TestInstance.Lifecycle.PER_CLASS)
class Tests {
......@@ -43,6 +46,10 @@ class Tests {
return File("$resourcePath/$fileName").readText(Charset.defaultCharset())
}
/** Opens the file [fileName] from the test resources directory as a stream. */
private fun createInputStream(fileName: String): InputStream =
    File("src/test/resources/$fileName").inputStream()
private val sftpServer = EmbeddedSftpServer(22000, "user", "password")
init {
......@@ -56,6 +63,20 @@ class Tests {
}
}
/**
 * Runs the test stylesheet over the test document and checks the identifier
 * picked up as the record key as well as the text collected in the handler's
 * output writer.
 */
@Test
fun `test xslt tranform`() {
    val props = Properties().apply {
        setProperty("xsltFilePath", "src/test/resources/test-transformer.xslt")
        setProperty("identifierFieldName", "identifierMain")
        setProperty("recordTag", "record")
    }
    val transformer = XMLTransformer(props)
    val result = transformer.applyXSLT("", createInputStream("test-transformer.xml"))
    assertThat(result.key)
        .isEqualTo("ADG-102821")
    // `output` is a StringWriter, which never equals a String; compare its text instead.
    assertThat(result.value.output.toString())
        .isEqualTo("")
}
/*
@ParameterizedTest
@MethodSource("testParams")
fun `test inputs`(params: TestParams) {
......@@ -117,5 +138,5 @@ class Tests {
Report("", "", ""),
Report("", "", "")
)
)
)*/
}
<?xml version="1.0"?>
<!--
~ xml-data-transform
~ Copyright (C) 2020 Memoriav
~
~ This program is free software: you can redistribute it and/or modify
~ it under the terms of the GNU Affero General Public License as published by
~ the Free Software Foundation, either version 3 of the License, or
~ (at your option) any later version.
~
~ This program is distributed in the hope that it will be useful,
~ but WITHOUT ANY WARRANTY; without even the implied warranty of
~ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
~ GNU Affero General Public License for more details.
~
~ You should have received a copy of the GNU Affero General Public License
~ along with this program. If not, see <https://www.gnu.org/licenses/>.
-->
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<xsl:template name="root">
<creator>
</creator>
</xsl:template>
</xsl:stylesheet>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:fn="http://www.w3.org/2005/xpath-functions"
version="2.0"
xmlns:ns2="http://purl.org/dc/elements/1.1/"
xmlns:foxml="info:fedora/fedora-system:def/foxml#"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns:audit="info:fedora/fedora-system:def/audit#"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/"
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#"
xmlns:fedora="info:fedora/fedora-system:def/relations-external#"
xmlns:fedora-model="info:fedora/fedora-system:def/model#"
xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:xslt="http://www.w3.org/1999/XSL/Transform">
<xsl:output
indent="yes"
method="xml"
/>
<!--<xsl:template match="/digitalObject/datastream/datastreamVersion/xmlContent/ebuCoreMain/coreMetadata">
<xsl:copy-of select="."/>
</xsl:template>-->
<xsl:template match="coreMetadata">
<xsl:element name="record">
<!--<xsl:apply-templates select="test"/>
<xsl:apply-templates select="title, ns2:title, alternativeTitle, subject, description, format/essenceLocator, format/medium, type/*, references"/>-->
<xsl:apply-templates />
</xsl:element>
</xsl:template>
<!-- content of child node is copied to parent node -->
<xsl:template match="title | format/essenceLocator | format/duration | format/start | language | references | coverage/spatial/location">
<xsl:element name="{local-name()}">
<xsl:value-of select="child::*"/>
</xsl:element>
</xsl:template>
<!-- content of typeLabel is transformed to an element name and content of child node is copied to it-->
<xsl:template match="description[@typeLabel] | alternativeTitle[@typeLabel] | subject[@typeLabel]">
<xsl:variable name="typeLabel" select="@typeLabel"/>
<xsl:element name="{$typeLabel}">
<xsl:value-of select="child::*"/>
</xsl:element>
</xsl:template>
<!-- content of typeLabel is copied as content of the node -->
<xsl:template match="format/medium | format/dataFormat/captioningFormat | type/*">
<xsl:element name="{local-name()}">
<xsl:value-of select="@typeLabel | @language"/>
</xsl:element>
</xsl:template>
<xsl:template match="identifier">
<xsl:variable name="typeLabel" select="@typeLabel"/>
<xsl:element name="{$typeLabel}">
<xsl:value-of select="child::ns2:identifier"/>
</xsl:element>
</xsl:template>
<xsl:template match="format/videoFormat | format/imageFormat | format/audioFormat">
<xsl:variable name="format" select="local-name()"/>
<xsl:for-each select="technicalAttributeString">
<xsl:variable name="typeLabel" select="@typeLabel"/>
<xsl:element name="{$format}{$typeLabel}">
<xsl:value-of select="."/>
</xsl:element>
</xsl:for-each>
</xsl:template>
<!-- ToDo: read out the role while excluding the owning institution; put the producer into its own field based on its role -->
<!-- Emits one wrapper element per contributor, creator or publisher: "<type>CorporateBody" when the node has an organisationDetails child, "<type>Person" when it has a contactDetails child. Each wrapper holds a name element and, if a role child exists, a role element. -->
<xsl:template match="contributor | creator | publisher">
<xsl:variable name="type" select="local-name()"/>
<xsl:for-each select=".">
<xsl:choose>
<xsl:when test="child::organisationDetails">
<xsl:element name="{$type}CorporateBody">
<xsl:element name="name">
<xsl:value-of select="descendant::organisationName"/>
</xsl:element>
<xsl:if test="child::role">
<xsl:element name="role">
<xsl:value-of select="child::role[@typeLabel]"/> <!-- does not work yet. NOTE(review): this selects the text of role elements that carry a typeLabel attribute, not the attribute value; presumably role/@typeLabel was intended, confirm against the source data -->
</xsl:element>
</xsl:if>
</xsl:element>
</xsl:when>
<xsl:when test="child::contactDetails">
<xsl:element name="{$type}Person">
<xsl:element name="name">
<xsl:value-of select="descendant::name"/>
</xsl:element>
<xsl:if test="child::role">
<xsl:element name="role">
<xsl:value-of select="child::role[@typeLabel]"/> <!-- does not work yet. NOTE(review): same select as in the CorporateBody branch above, confirm whether role/@typeLabel was intended -->
</xsl:element>
</xsl:if>
</xsl:element>
</xsl:when>
</xsl:choose>
</xsl:for-each>
</xsl:template>
<!-- Maps the text of an Access rights statement to access elements: 'onsite' produces accessPhsyical, while 'noonsite', 'public', 'private' and 'faro' each produce accessDigital. Every test is independent, so one statement can emit several elements. -->
<!-- NOTE(review): matches() succeeds when the pattern occurs anywhere in the string, so a value containing 'noonsite' also satisfies the 'onsite' test and emits both elements; confirm whether that is intended. -->
<xsl:template match="rights[@typeLabel='Access']/ns2:rights">
<xsl:if test="matches(.,'onsite')">
<xsl:element name="accessPhsyical">
<xsl:text>onsite</xsl:text>
</xsl:element>
</xsl:if>
<xsl:if test="matches(.,'noonsite')">
<xsl:element name="accessDigital">
<xsl:text>noonsite</xsl:text>
</xsl:element>
</xsl:if>
<xsl:if test="matches(.,'public')">
<xsl:element name="accessDigital">
<xsl:text>public</xsl:text>
</xsl:element>
</xsl:if>
<xsl:if test="matches(.,'private')">
<xsl:element name="accessDigital">
<xsl:text>private</xsl:text>
</xsl:element>
</xsl:if>
<xsl:if test="matches(.,'faro')">
<xsl:element name="accessDigital">
<xsl:text>faro</xsl:text>
</xsl:element>
</xsl:if>
</xsl:template>
<xsl:template match="rights[@typeLabel='Holder']/rightsHolder/contactDetails/name">
<xsl:element name="rightsHolder">
<xsl:value-of select="."/>
</xsl:element>
</xsl:template>
<xsl:template match="date">
<xsl:for-each select=".">
<xsl:choose>
<xsl:when test="child::created">
<xsl:element name="dateCreated">
<xsl:call-template name="dates"/>
</xsl:element>
</xsl:when>
<xsl:when test="child::issued">
<xsl:element name="dateIssued">
<xsl:call-template name="dates"/>
</xsl:element>
</xsl:when>
</xsl:choose>
</xsl:for-each>
</xsl:template>
<!-- ToDo: extend to cover all attributes of date -->
<!-- Named template called from the date template: writes the startDate attribute of every child element of the current node. -->
<xsl:template name="dates">
<xsl:for-each select="child::*">
<xsl:value-of select="@startDate"/>
</xsl:for-each>
</xsl:template>
<!-- field to delete -->
<xsl:template match="isMemberOf | rights[@typeLabel='Usage']"/>
</xsl:stylesheet>
{
"title": "Grabung Castaneda (Siedlung)",
"SerieTitle": "Grabung Walo Burkart und Karl Keller-Tarnuzzer, Castaneda (Gräberfeld und Siedlung)",
"creatorPerson": {
"name": "[unbekannt]"
},
"Keywords": "Ausgrabung, Archäologie",
"Abstract": "Feld 1, östlicher Teil von Westen",
"Claim": "Dieses Dokument wurde Dank der Unterstützung von Memoriav erhalten.",
"RecordingLocation": "Castaneda",
"contributorCorporateBody": {
"name": "ADG"
},
"genre": "Grabungsfotografie / Arbeitsfotografie / Sachfotografie",
"objectType": "photograph",
"medium": "[keine Information vorhanden]",
"imageFormatColorMode": "sw",
"imageFormatRemarks": "Album-Nr.: A2",
"Original": "102821",
"CallNumber": "RM_1_33_6",
"Main": "ADG-102821",
"location": "Schweiz, Graubünden, Castaneda",
"rightsHolder": "Archäologischer Dienst Graubünden",
"accessPhsyical": "onsite",
"accessDigital": "public"
}
\ No newline at end of file
This diff is collapsed.
<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:fn="http://www.w3.org/2005/xpath-functions"
version="2.0"
xmlns:ns2="http://purl.org/dc/elements/1.1/"
xmlns:foxml="info:fedora/fedora-system:def/foxml#"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns:audit="info:fedora/fedora-system:def/audit#"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/"
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#"
xmlns:fedora="info:fedora/fedora-system:def/relations-external#"
xmlns:fedora-model="info:fedora/fedora-system:def/model#"
xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:xslt="http://www.w3.org/1999/XSL/Transform">
<xsl:output
indent="no"
method="xml"
/>
<!--<xsl:template match="/digitalObject/datastream/datastreamVersion/xmlContent/ebuCoreMain/coreMetadata">
<xsl:copy-of select="."/>
</xsl:template>-->
<xsl:template match="/">
<record>
<xsl:apply-templates />
</record>
<!--<xsl:element name="record">
<xsl:apply-templates select="test"/>
<xsl:apply-templates select="title, ns2:title, alternativeTitle, subject, description, format/essenceLocator, format/medium, type/*, references"/>
</xsl:element>-->
</xsl:template>
<!-- content of child node is copied to parent node -->
<xsl:template match="title | format/essenceLocator | format/duration | format/start | language | references | coverage/spatial/location">
<xsl:element name="{local-name()}">
<xsl:value-of select="child::*"/>
</xsl:element>
</xsl:template>
<!-- content of typeLabel is transformed to an element name and content of child node is copied to it-->
<xsl:template match="description[@typeLabel] | alternativeTitle[@typeLabel] | subject[@typeLabel]">
<xsl:variable name="typeLabel" select="@typeLabel"/>
<xsl:element name="{$typeLabel}">
<xsl:value-of select="child::*"/>
</xsl:element>
</xsl:template>
<!-- content of typeLabel is copied as content of the node -->
<xsl:template match="format/medium | format/dataFormat/captioningFormat | type/*">
<xsl:element name="{local-name()}">
<xsl:value-of select="@typeLabel | @language"/>
</xsl:element>
</xsl:template>
<xsl:template match="identifier">
<xsl:variable name="typeLabel" select="@typeLabel"/>
<xsl:element name="identifier{$typeLabel}">
<xsl:value-of select="child::ns2:identifier"/>
</xsl:element>
</xsl:template>
<xsl:template match="format/videoFormat | format/imageFormat | format/audioFormat">
<xsl:variable name="format" select="local-name()"/>
<xsl:for-each select="technicalAttributeString">
<xsl:variable name="typeLabel" select="@typeLabel"/>
<xsl:element name="{$format}{$typeLabel}">
<xsl:value-of select="."/>
</xsl:element>
</xsl:for-each>
</xsl:template>
<!-- ToDo: read out the role while excluding the owning institution; put the producer into its own field based on its role -->
<!-- Emits one wrapper element per contributor, creator or publisher: "<type>CorporateBody" when the node has an organisationDetails child, "<type>Person" when it has a contactDetails child. Each wrapper holds a name element and, if a role child exists, a role element. -->
<xsl:template match="contributor | creator | publisher">
<xsl:variable name="type" select="local-name()"/>
<xsl:for-each select=".">
<xsl:choose>
<xsl:when test="child::organisationDetails">
<xsl:element name="{$type}CorporateBody">
<xsl:element name="name">
<xsl:value-of select="descendant::organisationName"/>
</xsl:element>
<xsl:if test="child::role">
<xsl:element name="role">
<xsl:value-of select="child::role[@typeLabel]"/> <!-- does not work yet. NOTE(review): this selects the text of role elements that carry a typeLabel attribute, not the attribute value; presumably role/@typeLabel was intended, confirm against the source data -->
</xsl:element>
</xsl:if>
</xsl:element>
</xsl:when>
<xsl:when test="child::contactDetails">
<xsl:element name="{$type}Person">
<xsl:element name="name">
<xsl:value-of select="descendant::name"/>
</xsl:element>
<xsl:if test="child::role">
<xsl:element name="role">
<xsl:value-of select="child::role[@typeLabel]"/> <!-- does not work yet. NOTE(review): same select as in the CorporateBody branch above, confirm whether role/@typeLabel was intended -->
</xsl:element>
</xsl:if>
</xsl:element>
</xsl:when>
</xsl:choose>
</xsl:for-each>
</xsl:template>
<xsl:template match="rights[@typeLabel='Access']/ns2:rights">
<xsl:if test="matches(.,'onsite')">
<xsl:element name="accessPhsyical">
<xsl:text>onsite</xsl:text>
</xsl:element>
</xsl:if>