Commit 7223ee39 authored by Jonas Waeber's avatar Jonas Waeber
Browse files

First implementation for xml data transform

parent 5fa97705
...@@ -32,31 +32,24 @@ ext { ...@@ -32,31 +32,24 @@ ext {
} }
dependencies { dependencies {
// https://mvnrepository.com/artifact/org.elasticsearch.client/elasticsearch-rest-high-level-client
//compile group: 'org.elasticsearch.client', name: 'elasticsearch-rest-high-level-client', version: '7.1.0'
// Logging Framework // Logging Framework
implementation "org.apache.logging.log4j:log4j-api:${log4jV}" implementation "org.apache.logging.log4j:log4j-api:${log4jV}"
implementation "org.apache.logging.log4j:log4j-core:${log4jV}" implementation "org.apache.logging.log4j:log4j-core:${log4jV}"
implementation "org.apache.logging.log4j:log4j-slf4j-impl:${log4jV}" implementation "org.apache.logging.log4j:log4j-slf4j-impl:${log4jV}"
// Kafka Imports
implementation group: 'org.apache.kafka', name: 'kafka-clients', version: kafkaV
implementation "org.apache.kafka:kafka-streams:${kafkaV}" implementation "org.apache.kafka:kafka-streams:${kafkaV}"
implementation 'org.memobase:memobase-service-utilities:1.4.0' implementation 'org.memobase:memobase-service-utilities:1.7.1'
// CSV Reader
implementation("com.github.doyaaaaaken:kotlin-csv-jvm:0.7.3") // https://mvnrepository.com/artifact/net.sf.saxon/Saxon-HE
// XSLX / XSL Reader compile group: 'net.sf.saxon', name: 'Saxon-HE', version: '9.9.1-7'
implementation 'org.apache.poi:poi:4.1.2'
implementation 'org.apache.poi:poi-ooxml:4.1.2' // used by saxon library
// ODS Reader // https://mvnrepository.com/artifact/com.ibm.icu/icu4j
implementation 'org.odftoolkit:odftoolkit:1.0.0-BETA1' compile group: 'com.ibm.icu', name: 'icu4j', version: '67.1'
// JSON Parser // JSON Parser
implementation 'com.beust:klaxon:5.2' implementation 'com.beust:klaxon:5.2'
// Compression
//implementation "org.apache.commons:commons-compress:1.19"
implementation 'org.jetbrains.kotlin:kotlin-stdlib-jdk8' implementation 'org.jetbrains.kotlin:kotlin-stdlib-jdk8'
implementation "org.jetbrains.kotlin:kotlin-script-runtime:1.3.71" implementation "org.jetbrains.kotlin:kotlin-script-runtime:1.3.71"
......
/* /*
* Table Data Import Service * XML Data Import Service
* Copyright (C) 2020 Memoriav * Copyright (C) 2020 Memoriav
* *
* This program is free software: you can redistribute it and/or modify * This program is free software: you can redistribute it and/or modify
...@@ -24,37 +24,16 @@ import org.apache.kafka.streams.StreamsBuilder ...@@ -24,37 +24,16 @@ import org.apache.kafka.streams.StreamsBuilder
import org.apache.kafka.streams.Topology import org.apache.kafka.streams.Topology
import org.apache.kafka.streams.kstream.KStream import org.apache.kafka.streams.kstream.KStream
import org.apache.kafka.streams.kstream.Predicate import org.apache.kafka.streams.kstream.Predicate
import org.apache.logging.log4j.LogManager
import org.memobase.settings.SettingsLoader import org.memobase.settings.SettingsLoader
import org.memobase.sftp.SftpClient import org.memobase.sftp.SftpClient
import java.io.File import java.io.File
import java.io.FileInputStream
import java.io.InputStream import java.io.InputStream
import java.io.StringReader import java.io.StringReader
import javax.xml.transform.TransformerFactory
import javax.xml.transform.sax.SAXResult
import javax.xml.transform.stream.StreamSource
import kotlin.system.exitProcess
class KafkaTopology(private val settings: SettingsLoader) { class KafkaTopology(private val settings: SettingsLoader) {
private val log = LogManager.getLogger("KafkaTopologySetup")
private val sftpClient: SftpClient = SftpClient(settings.sftpSettings) private val sftpClient: SftpClient = SftpClient(settings.sftpSettings)
private val xlstFilePath = settings.appSettings.getProperty("xsltFilePath") private val xmlTransformer = XMLTransformer(settings.appSettings)
private val identifierFieldName = settings.appSettings.getProperty("identifierFieldName")
private val recordTag = settings.appSettings.getProperty("recordTag")
init {
if (identifierFieldName == "placeholderValue") {
log.error("Requires a value for identifier field name, but found default value.")
exitProcess(1)
}
}
private val factory = TransformerFactory.newInstance()
private val xslt = StreamSource(FileInputStream(File(xlstFilePath)))
private val transformer = factory.newTransformer(xslt)
private val reportingTopic = settings.outputTopic + "-reporting" private val reportingTopic = settings.outputTopic + "-reporting"
fun build(): Topology { fun build(): Topology {
...@@ -131,11 +110,7 @@ class KafkaTopology(private val settings: SettingsLoader) { ...@@ -131,11 +110,7 @@ class KafkaTopology(private val settings: SettingsLoader) {
} }
private fun transformXml(key: String, data: InputStream): KeyValue<String, SAXContentHandler> { private fun transformXml(key: String, data: InputStream): KeyValue<String, SAXContentHandler> {
val contentHandler = SAXContentHandler(key, identifierFieldName, recordTag) return xmlTransformer.applyXSLT(key, data)
data.use {
transformer.transform(StreamSource(it), SAXResult(contentHandler))
}
return KeyValue(contentHandler.identifier, contentHandler)
} }
......
...@@ -19,12 +19,15 @@ ...@@ -19,12 +19,15 @@
package org.memobase package org.memobase
import com.beust.klaxon.JsonObject import com.beust.klaxon.JsonObject
import org.apache.logging.log4j.LogManager
import org.xml.sax.Attributes import org.xml.sax.Attributes
import org.xml.sax.ContentHandler import org.xml.sax.ContentHandler
import org.xml.sax.Locator import org.xml.sax.Locator
import java.io.StringWriter import java.io.StringWriter
class SAXContentHandler(key: String, private val identifierFieldName: String, private val recordTag: String) : ContentHandler { class SAXContentHandler(key: String, private val identifierFieldName: String, private val recordTag: String) : ContentHandler {
private val log = LogManager.getLogger("SAXHandler")
val output = StringWriter() val output = StringWriter()
var identifier: String = key var identifier: String = key
var report: Report? = null var report: Report? = null
...@@ -70,10 +73,19 @@ class SAXContentHandler(key: String, private val identifierFieldName: String, pr ...@@ -70,10 +73,19 @@ class SAXContentHandler(key: String, private val identifierFieldName: String, pr
/**
 * Receives a chunk of character data from the SAX event stream and appends its trimmed text
 * to the element that is currently open.
 *
 * Only the slice `[start, start + size)` of [characters] belongs to this event; the rest of the
 * array is owned by the producer and may hold unrelated data, so it must not be read.
 *
 * NOTE(review): every chunk is trimmed on its own, so if a producer splits one text node across
 * several calls, whitespace at the chunk borders is lost — confirm whether that matters here.
 *
 * @param characters buffer holding the character data, or null (ignored).
 * @param start index of the first character that belongs to this event.
 * @param size number of characters that belong to this event.
 */
override fun characters(characters: CharArray?, start: Int, size: Int) {
    if (characters != null) {
        // Respect the SAX contract: use the announced slice, not the whole buffer.
        val line = String(characters, start, size).trim()
        if (line.isNotEmpty()) {
            when {
                currentInnerElementTag != "" -> {
                    currentInnerElementContent += line
                }
                currentElementTag != "" -> {
                    currentElementContent += line
                }
                else -> {
                    // Text outside of any tracked element is unexpected; surface it in the log.
                    log.error(line)
                }
            }
        }
    }
}
...@@ -109,14 +121,19 @@ class SAXContentHandler(key: String, private val identifierFieldName: String, pr ...@@ -109,14 +121,19 @@ class SAXContentHandler(key: String, private val identifierFieldName: String, pr
identifier = currentElementContent identifier = currentElementContent
} }
if (innerElements.isEmpty()) { if (innerElements.isEmpty()) {
jsonResult[currentElementTag] = currentElementContent if (currentElementContent.isNotEmpty()) {
currentElementContent = "" jsonResult[currentElementTag] = currentElementContent
}
} else { } else {
jsonResult[currentElementTag] = innerElements.toMap() jsonResult[currentElementTag] = innerElements.toMap()
innerElements.clear() innerElements.clear()
} }
currentElementTag = ""
currentElementContent = ""
} else if (currentInnerElementTag == localName) { } else if (currentInnerElementTag == localName) {
innerElements.add(Pair(currentInnerElementTag, currentInnerElementContent)) if (currentInnerElementContent.isNotEmpty()) {
innerElements.add(Pair(currentInnerElementTag, currentInnerElementContent))
}
currentInnerElementTag = "" currentInnerElementTag = ""
currentInnerElementContent = "" currentInnerElementContent = ""
} else { } else {
......
/* /*
* text-file-validation * XML Data Transform Service
* Copyright (C) 2020 Memoriav * Copyright (C) 2020 Memoriav
* *
* This program is free software: you can redistribute it and/or modify * This program is free software: you can redistribute it and/or modify
......
/*
* xml-data-transform
* Copyright (C) 2020 Memoriav
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package org.memobase
import net.sf.saxon.s9api.Processor
import net.sf.saxon.s9api.SAXDestination
import net.sf.saxon.s9api.StaticError
import net.sf.saxon.s9api.XsltExecutable
import org.apache.kafka.streams.KeyValue
import org.apache.logging.log4j.LogManager
import java.io.File
import java.io.FileInputStream
import java.io.InputStream
import java.util.Properties
import javax.xml.transform.stream.StreamSource
/**
 * Compiles the XSLT stylesheet named in the application settings once and applies it to XML
 * input streams, feeding the transformation result into a [SAXContentHandler].
 *
 * Required settings: `xsltFilePath`, `identifierFieldName` and `recordTag`.
 *
 * NOTE(review): a single loaded transformer instance is reused for every call of [applyXSLT];
 * confirm that callers never invoke it from more than one thread at a time.
 *
 * @param appSettings application properties holding the settings listed above.
 * @throws IllegalArgumentException if one of the required settings is missing.
 * @throws Exception if the stylesheet compiler reported static errors.
 */
class XMLTransformer(appSettings: Properties) {
    private val log = LogManager.getLogger("XMLTransformer")

    // Fail fast with a readable message instead of a late null failure deep inside File/SAX code.
    private val xlstFilePath: String = requireNotNull(appSettings.getProperty("xsltFilePath")) {
        "Missing required application setting 'xsltFilePath'."
    }
    private val identifierFieldName: String = requireNotNull(appSettings.getProperty("identifierFieldName")) {
        "Missing required application setting 'identifierFieldName'."
    }
    private val recordTag: String = requireNotNull(appSettings.getProperty("recordTag")) {
        "Missing required application setting 'recordTag'."
    }

    init {
        if (identifierFieldName == "placeholderValue") {
            log.error("Requires a value for identifier field name, but found default value.")
        }
    }

    private val processor = Processor(false)
    private val xslt = compileXslt()
    private val transformer = xslt.load()

    /**
     * Compiles the stylesheet at [xlstFilePath].
     *
     * @return the compiled, reusable stylesheet.
     * @throws Exception if the compiler added entries to the error list.
     */
    private fun compileXslt(): XsltExecutable {
        val errorList = mutableListOf<StaticError>()
        val xsltCompiler = processor.newXsltCompiler()
        xsltCompiler.setErrorList(errorList)
        // Close the stylesheet file as soon as compilation is done; it was left open before.
        val executable = FileInputStream(File(xlstFilePath)).use { stylesheet ->
            xsltCompiler.compile(StreamSource(stylesheet))
        }
        if (errorList.isEmpty()) {
            return executable
        } else {
            throw Exception(errorList.joinToString())
        }
    }

    /**
     * Transforms one XML document and collects the result in a fresh [SAXContentHandler].
     *
     * @param key fallback record key handed to the content handler.
     * @param data XML input; it is closed by this function.
     * @return the identifier reported by the content handler paired with the handler itself.
     * @throws Exception if the handler ends up with an empty identifier.
     */
    fun applyXSLT(key: String, data: InputStream): KeyValue<String, SAXContentHandler> {
        val contentHandler = SAXContentHandler(key, identifierFieldName, recordTag)
        data.use {
            transformer.setSource(StreamSource(it))
            transformer.destination = SAXDestination(contentHandler)
            transformer.transform()
        }
        if (contentHandler.identifier.isEmpty()) {
            throw Exception("No valid identifier found in record $key in field $identifierFieldName.")
        } else {
            return KeyValue(contentHandler.identifier, contentHandler)
        }
    }
}
\ No newline at end of file
...@@ -29,10 +29,13 @@ import org.apache.kafka.streams.TopologyTestDriver ...@@ -29,10 +29,13 @@ import org.apache.kafka.streams.TopologyTestDriver
import org.apache.kafka.streams.test.ConsumerRecordFactory import org.apache.kafka.streams.test.ConsumerRecordFactory
import org.apache.logging.log4j.LogManager import org.apache.logging.log4j.LogManager
import org.assertj.core.api.Assertions.assertThat import org.assertj.core.api.Assertions.assertThat
import org.junit.jupiter.api.Test
import org.junit.jupiter.api.TestInstance import org.junit.jupiter.api.TestInstance
import org.junit.jupiter.params.ParameterizedTest import org.junit.jupiter.params.ParameterizedTest
import org.junit.jupiter.params.provider.MethodSource import org.junit.jupiter.params.provider.MethodSource
import org.memobase.testing.EmbeddedSftpServer import org.memobase.testing.EmbeddedSftpServer
import java.io.InputStream
import java.util.Properties
@TestInstance(TestInstance.Lifecycle.PER_CLASS) @TestInstance(TestInstance.Lifecycle.PER_CLASS)
class Tests { class Tests {
...@@ -43,6 +46,10 @@ class Tests { ...@@ -43,6 +46,10 @@ class Tests {
return File("$resourcePath/$fileName").readText(Charset.defaultCharset()) return File("$resourcePath/$fileName").readText(Charset.defaultCharset())
} }
/** Opens [fileName] below `src/test/resources` as a stream; the caller has to close it. */
private fun createInputStream(fileName: String): InputStream =
    File("src/test/resources/$fileName").inputStream()
private val sftpServer = EmbeddedSftpServer(22000, "user", "password") private val sftpServer = EmbeddedSftpServer(22000, "user", "password")
init { init {
...@@ -56,6 +63,20 @@ class Tests { ...@@ -56,6 +63,20 @@ class Tests {
} }
} }
/**
 * Runs the test stylesheet against the test XML document and checks the record key that the
 * content handler extracted from the `identifierMain` field.
 */
@Test
fun `test xslt tranform`() {
    val props = Properties()
    props.setProperty("xsltFilePath", "src/test/resources/test-transformer.xslt")
    props.setProperty("identifierFieldName", "identifierMain")
    props.setProperty("recordTag", "record")
    val transformer = XMLTransformer(props)
    val result = transformer.applyXSLT("", createInputStream("test-transformer.xml"))
    assertThat(result.key)
        .isEqualTo("ADG-102821")
    // Compare the writer's text: a StringWriter instance is never equal to a String, so the
    // previous assertion could not pass regardless of what the handler wrote.
    // NOTE(review): the expected value "" is kept from the original — confirm it is intended.
    assertThat(result.value.output.toString())
        .isEqualTo("")
}
/*
@ParameterizedTest @ParameterizedTest
@MethodSource("testParams") @MethodSource("testParams")
fun `test inputs`(params: TestParams) { fun `test inputs`(params: TestParams) {
...@@ -117,5 +138,5 @@ class Tests { ...@@ -117,5 +138,5 @@ class Tests {
Report("", "", ""), Report("", "", ""),
Report("", "", "") Report("", "", "")
) )
) )*/
} }
<?xml version="1.0"?> <?xml version="1.0" encoding="UTF-8"?>
<!-- <xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
~ xml-data-transform xmlns:fn="http://www.w3.org/2005/xpath-functions"
~ Copyright (C) 2020 Memoriav version="2.0"
~ xmlns:ns2="http://purl.org/dc/elements/1.1/"
~ This program is free software: you can redistribute it and/or modify xmlns:foxml="info:fedora/fedora-system:def/foxml#"
~ it under the terms of the GNU Affero General Public License as published by xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
~ the Free Software Foundation, either version 3 of the License, or xmlns:audit="info:fedora/fedora-system:def/audit#"
~ (at your option) any later version. xmlns:dc="http://purl.org/dc/elements/1.1/"
~ xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/"
~ This program is distributed in the hope that it will be useful, xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
~ but WITHOUT ANY WARRANTY; without even the implied warranty of xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#"
~ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the xmlns:fedora="info:fedora/fedora-system:def/relations-external#"
~ GNU Affero General Public License for more details. xmlns:fedora-model="info:fedora/fedora-system:def/model#"
~ xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:xslt="http://www.w3.org/1999/XSL/Transform">
~ You should have received a copy of the GNU Affero General Public License
~ along with this program. If not, see <https://www.gnu.org/licenses/>. <xsl:output
--> indent="yes"
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"> method="xml"
<xsl:template name="root"> />
<creator>
</creator> <!--<xsl:template match="/digitalObject/datastream/datastreamVersion/xmlContent/ebuCoreMain/coreMetadata">
</xsl:template> <xsl:copy-of select="."/>
</xsl:stylesheet> </xsl:template>-->
\ No newline at end of file
<!-- Entry point for one record: wraps whatever the templates applied to the children of
     coreMetadata produce in a single <record> element. Children without a matching template
     in this stylesheet are handled by the XSLT built-in template rules. -->
<xsl:template match="coreMetadata">
<xsl:element name="record">
<!--<xsl:apply-templates select="test"/>
<xsl:apply-templates select="title, ns2:title, alternativeTitle, subject, description, format/essenceLocator, format/medium, type/*, references"/>-->
<xsl:apply-templates />
</xsl:element>
</xsl:template>
<!-- For each listed element, emits an element with the same local name whose text is the
     string value of the element's child elements (not of the element's own text nodes). -->
<xsl:template match="title | format/essenceLocator | format/duration | format/start | language | references | coverage/spatial/location">
<xsl:element name="{local-name()}">
<xsl:value-of select="child::*"/>
</xsl:element>
</xsl:template>
<!-- Renames description / alternativeTitle / subject elements that carry a typeLabel attribute:
     the attribute value becomes the output element name and the string value of the child
     elements becomes its content. -->
<xsl:template match="description[@typeLabel] | alternativeTitle[@typeLabel] | subject[@typeLabel]">
    <xsl:element name="{@typeLabel}">
        <xsl:value-of select="*"/>
    </xsl:element>
</xsl:template>
<!-- Emits an element named after the matched element whose content is taken from its
     attributes: the union of @typeLabel and @language is selected, so either one or both
     may contribute to the value. -->
<xsl:template match="format/medium | format/dataFormat/captioningFormat | type/*">
<xsl:element name="{local-name()}">
<xsl:value-of select="@typeLabel | @language"/>
</xsl:element>
</xsl:template>
<!-- Turns an identifier into an element named after its typeLabel attribute; the content is
     the string value of its ns2:identifier child. -->
<xsl:template match="identifier">
    <xsl:element name="{@typeLabel}">
        <xsl:value-of select="ns2:identifier"/>
    </xsl:element>
</xsl:template>
<!-- For every technicalAttributeString below a video/image/audio format element, emits an
     element named "<format element name><typeLabel>" that holds the attribute string's value. -->
<xsl:template match="format/videoFormat | format/imageFormat | format/audioFormat">
    <xsl:variable name="formatName" select="local-name()"/>
    <xsl:for-each select="technicalAttributeString">
        <xsl:element name="{concat($formatName, @typeLabel)}">
            <xsl:value-of select="."/>
        </xsl:element>
    </xsl:for-each>
</xsl:template>
<!-- Maps contributor / creator / publisher to "<type>CorporateBody" (when organisationDetails
     is present) or "<type>Person" (when contactDetails is present), each with a nested name
     and, if a role child exists, a nested role holding the role's typeLabel attribute.
     TODO: evaluate the role to exclude the owning institution, and move producers into their
     own field based on the role. -->
<xsl:template match="contributor | creator | publisher">
    <xsl:variable name="type" select="local-name()"/>
    <xsl:choose>
        <xsl:when test="child::organisationDetails">
            <xsl:element name="{$type}CorporateBody">
                <xsl:element name="name">
                    <xsl:value-of select="descendant::organisationName"/>
                </xsl:element>
                <xsl:if test="child::role">
                    <xsl:element name="role">
                        <!-- Select the attribute itself; role[@typeLabel] only filtered the
                             role elements and yielded their text instead of the label. -->
                        <xsl:value-of select="child::role/@typeLabel"/>
                    </xsl:element>
                </xsl:if>
            </xsl:element>
        </xsl:when>
        <xsl:when test="child::contactDetails">
            <xsl:element name="{$type}Person">
                <xsl:element name="name">
                    <xsl:value-of select="descendant::name"/>
                </xsl:element>
                <xsl:if test="child::role">
                    <xsl:element name="role">
                        <!-- Same fix as above: read the typeLabel attribute of the role. -->
                        <xsl:value-of select="child::role/@typeLabel"/>
                    </xsl:element>
                </xsl:if>
            </xsl:element>
        </xsl:when>
    </xsl:choose>
</xsl:template>
<!-- Derives access fields from the text of an Access rights statement: "onsite" yields
     accessPhsyical, and each of "noonsite", "public", "private", "faro" found in the text
     yields one accessDigital element with that keyword as content.
     NOTE(review): the regex 'onsite' also matches inside 'noonsite' - confirm that emitting
     accessPhsyical for a 'noonsite' statement is intended. -->
<xsl:template match="rights[@typeLabel='Access']/ns2:rights">
    <xsl:variable name="statement" select="."/>
    <xsl:if test="matches($statement,'onsite')">
        <xsl:element name="accessPhsyical">
            <xsl:text>onsite</xsl:text>
        </xsl:element>
    </xsl:if>
    <!-- Keywords are checked in the same order as before so the output order is unchanged. -->
    <xsl:for-each select="('noonsite', 'public', 'private', 'faro')">
        <xsl:if test="matches($statement, .)">
            <xsl:element name="accessDigital">
                <xsl:value-of select="."/>
            </xsl:element>
        </xsl:if>
    </xsl:for-each>
</xsl:template>
<!-- Emits the contact name of a Holder rights statement as a rightsHolder element. -->
<xsl:template match="rights[@typeLabel='Holder']/rightsHolder/contactDetails/name">
<xsl:element name="rightsHolder">
<xsl:value-of select="."/>
</xsl:element>
</xsl:template>
<!-- Emits dateCreated when the date has a created child, otherwise dateIssued when it has an
     issued child; the content comes from the named template "dates" evaluated on the date. -->
<xsl:template match="date">
    <xsl:choose>
        <xsl:when test="created">
            <xsl:element name="dateCreated">
                <xsl:call-template name="dates"/>
            </xsl:element>
        </xsl:when>
        <xsl:when test="issued">
            <xsl:element name="dateIssued">
                <xsl:call-template name="dates"/>
            </xsl:element>
        </xsl:when>
    </xsl:choose>
</xsl:template>
<!-- Outputs the startDate attribute of every child element of the context node, one after
     the other. TODO: extend this to cover all attributes of date. -->
<xsl:template name="dates">
<xsl:for-each select="child::*">
<xsl:value-of select="@startDate"/>
</xsl:for-each>
</xsl:template>
<!-- Fields to drop: the empty template produces no output for isMemberOf elements and for
     rights elements whose typeLabel is 'Usage'; their content is not processed further. -->
<xsl:template match="isMemberOf | rights[@typeLabel='Usage']"/>
</xsl:stylesheet>
{
"title": "Grabung Castaneda (Siedlung)",
"SerieTitle": "Grabung Walo Burkart und Karl Keller-Tarnuzzer, Castaneda (Gräberfeld und Siedlung)",
"creatorPerson": {
"name": "[unbekannt]"
},
"Keywords": "Ausgrabung, Archäologie",
"Abstract": "Feld 1, östlicher Teil von Westen",
"Claim": "Dieses Dokument wurde Dank der Unterstützung von Memoriav erhalten.",
"RecordingLocation": "Castaneda",
"contributorCorporateBody": {
"name": "ADG"
},
"genre": "Grabungsfotografie / Arbeitsfotografie / Sachfotografie",
"objectType": "photograph",
"medium": "[keine Information vorhanden]",
"imageFormatColorMode": "sw",
"imageFormatRemarks": "Album-Nr.: A2",
"Original": "102821",
"CallNumber": "RM_1_33_6",
"Main": "ADG-102821",
"location": "Schweiz, Graubünden, Castaneda",
"rightsHolder": "Archäologischer Dienst Graubünden",
"accessPhsyical": "onsite",
"accessDigital": "public"
}
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<!--
~ xml-data-transform
~ Copyright (C) 2020 Memoriav
~
~ This program is free software: you can redistribute it and/or modify
~ it under the terms of the GNU Affero General Public License as published by
~ the Free Software Foundation, either version 3 of the License, or
~ (at your option) any later version.
~
~ This program is distributed in the hope that it will be useful,
~ but WITHOUT ANY WARRANTY; without even the implied warranty of
~ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
~ GNU Affero General Public License for more details.
~
~ You should have received a copy of the GNU Affero General Public License
~ along with this program. If not, see <https://www.gnu.org/licenses/>.
-->