Commit 3e30cb56 authored by Jonas Waeber's avatar Jonas Waeber

Refactor transformation implementation

parent e546e960
package org.memobase
import org.apache.logging.log4j.LogManager
import org.memobase.model.Institution
import org.memobase.model.Municipality
import kotlin.system.exitProcess
object MunicipalitiesLoader {
object Helpers {
private val log = LogManager.getLogger("MunicipalitiesLoader")
private val log = LogManager.getLogger("Helper")
fun getMunicipalities(): Map<String, Municipality> {
val stream = ClassLoader.getSystemResourceAsStream("municipalities.tsv")
......
......@@ -18,43 +18,67 @@
package org.memobase
import com.beust.klaxon.JsonObject
import com.beust.klaxon.Klaxon
import com.beust.klaxon.KlaxonException
import org.apache.kafka.streams.StreamsBuilder
import org.apache.kafka.streams.Topology
import org.apache.logging.log4j.LogManager
import org.memobase.model.LanguageString
import org.memobase.model.LanguageVariants
import org.memobase.model.MergedAddress
import org.memobase.model.MergedInstitution
import org.memobase.settings.SettingsLoader
import java.io.StringReader
class KafkaTopology(private val settings: SettingsLoader) {
private val log = LogManager.getLogger("StreamsProcessing")
private val municipalities = MunicipalitiesLoader.getMunicipalities()
private val municipalities = Helpers.getMunicipalities()
fun build(): Topology {
val builder = StreamsBuilder()
val stream = builder.stream<String, String>(settings.inputTopic)
stream
.flatMapValues { value -> parseJson(value) }
.flatMapValues { value -> value.keys.map { key -> Pair(value[key] as JsonObject, key) } }
.mapValues { value -> transformJson(value.first, value.second) }
.mapValues { value -> mergeTranslations(value) }
.mapValues { value -> transformJson(value) }
.map { _, value -> value.write() }
.to(settings.outputTopic)
return builder.build()
}
private fun parseJson(data: String): List<JsonObject> {
private fun parseJson(data: String): List<LanguageVariants> {
return try {
listOf(Klaxon().parseJsonObject(StringReader(data)))
val int = Klaxon().parse<LanguageVariants>(StringReader(data))
return if (int == null) emptyList()
else listOf(int)
} catch (ex: KlaxonException) {
log.error("Failed to parse source: $data.")
emptyList()
}
}
private fun transformJson(input: JsonObject, language: String): Transform {
return Transform(municipalities).createInstitution(input, language)
/**
 * Merges the German, French and Italian variants of one institution into a single
 * [MergedInstitution].
 *
 * Language-independent values (id, ISIL, e-mail, catalogue link, website, postal code)
 * are read from the German variant only. Title, description and the two address lines
 * are collected per language into [LanguageString]s.
 *
 * Addresses are paired by position, with the German list as the reference. When the
 * French or Italian variant has no address at a given position, the German address
 * is used for that language instead of failing with an IndexOutOfBoundsException.
 *
 * @param input the three parsed language variants of the institution.
 * @return the merged, language-aware representation of the institution.
 */
private fun mergeTranslations(input: LanguageVariants): MergedInstitution {
    val german = input.de
    val french = input.fr
    val italian = input.it
    return MergedInstitution(
        german.field_memobase_id,
        LanguageString(german.title, french.title, italian.title),
        LanguageString(german.field_text.value, french.field_text.value, italian.field_text.value),
        german.field_isil,
        german.field_email,
        german.field_link_archive_catalog.uri,
        german.field_website.uri,
        german.field_address.mapIndexed { index, germanAddress ->
            // The translated variants are not guaranteed to list as many addresses as
            // the German one; fall back to the German address rather than crash.
            val frenchAddress = french.field_address.getOrNull(index) ?: germanAddress
            val italianAddress = italian.field_address.getOrNull(index) ?: germanAddress
            MergedAddress(
                LanguageString(
                    germanAddress.address_line1,
                    frenchAddress.address_line1,
                    italianAddress.address_line1
                ),
                LanguageString(
                    germanAddress.address_line2,
                    frenchAddress.address_line2,
                    italianAddress.address_line2
                ),
                germanAddress.postal_code
            )
        }
    )
}
/**
 * Converts a merged institution into its RDF form.
 *
 * A new [Transform] is created for every call, using the municipalities held by this
 * topology, and the result of its `createInstitution` call is returned.
 */
private fun transformJson(input: MergedInstitution): Transform =
    Transform(municipalities).createInstitution(input)
}
package org.memobase
import com.beust.klaxon.JsonObject
import org.apache.jena.rdf.model.Literal
import org.apache.jena.rdf.model.ModelFactory
import org.apache.jena.rdf.model.Property
import org.apache.jena.rdf.model.Resource
import org.apache.jena.riot.RDFDataMgr
import org.apache.jena.riot.RDFFormat
import org.apache.kafka.streams.KeyValue
import org.apache.logging.log4j.LogManager
import org.memobase.rdf.*
import org.memobase.model.MergedAddress
import org.memobase.model.MergedInstitution
import org.memobase.model.Municipality
import org.memobase.rdf.NS
import org.memobase.rdf.RDF
import org.memobase.rdf.RICO
import org.memobase.rdf.WD
import java.io.StringWriter
import kotlin.system.exitProcess
class Transform(private val municipalities: Map<String, Municipality>) {
private val log = LogManager.getLogger("Transform")
private val model = ModelFactory.createDefaultModel()
private var resource: Resource? = null
private var uri: String = ""
fun createInstitution(source: JsonObject, language: String): Transform {
val id = source["field_memobase_id"].let {
if (it is String) {
it
} else {
log.error("No field memobase id defined.")
exitProcess(1)
}
}
val resource = model.createResource(NS.memint + id)
uri = resource.uri
val identifier = model.createResource()
identifier.addProperty(RDF.type, RICO.Identifier)
identifier.addProperty(RICO.type, literal("main"))
identifier.addProperty(RICO.identifier, literal(id))
resource.addProperty(RICO.identifiedBy, identifier)
// TODO: proper multi language integration!
resource.addProperty(RICO.name, langLiteral(source["field_name"] as String, language))
resource.addProperty(RICO.descriptiveNote, langLiteral(source["field_text"] as String, language))
source["field_addresses"].let { fieldAddressValue ->
try {
fieldAddressValue as List<JsonObject>
fieldAddressValue.forEach { fieldAddress ->
extractAddressField(resource, fieldAddress)
fun createInstitution(input: MergedInstitution): Transform {
resource = model.createResource(NS.memint + input.id)
resource.let { valResource ->
if (valResource != null) {
uri = valResource.uri
val identifier = model.createResource()
identifier.addProperty(RDF.type, RICO.Identifier)
identifier.addProperty(RICO.type, literal("main"))
identifier.addProperty(RICO.identifier, literal(input.id))
valResource.addProperty(RICO.identifiedBy, identifier)
// TODO: proper multi language integration!
valResource.addProperty(RICO.name, langLiteral(input.name.de, "de"))
valResource.addProperty(RICO.name, langLiteral(input.name.fr, "fr"))
valResource.addProperty(RICO.name, langLiteral(input.name.it, "it"))
valResource.addProperty(RICO.descriptiveNote, langLiteral(input.description.de, "de"))
valResource.addProperty(RICO.descriptiveNote, langLiteral(input.description.fr, "fr"))
valResource.addProperty(RICO.descriptiveNote, langLiteral(input.description.it, "it"))
input.addresses.forEach {
extractAddressField(valResource, it)
}
} catch (ex: ClassCastException) {
log.warn("Could not cast field_addresses to JsonObject: $fieldAddressValue.")
valResource.addProperty(WD.isil, literal(input.isil))
valResource.addProperty(WD.emailAddress, literal(input.contactEmail))
valResource.addProperty(WD.website, literal(input.website))
valResource.addProperty(WD.onlineArchive, literal(input.onlineCatalogueLink))
}
}
extractSimpleField(resource, WD.isil, source, "field_isil")
extractSimpleField(resource, WD.website, source, "field_website")
extractSimpleField(resource, WD.emailAddress, source, "field_email")
extractSimpleField(resource, WD.onlineArchive, source, "field_online_archive")
extractSimpleField(resource, SCHEMA.sameAs, source, "wikidata_id")
extractSimpleField(resource, WD.image, source, "image")
extractSimpleField(resource, WD.logo, source, "logo")
extractSimpleField(resource, WD.typeOfInstitution, source, "instance_of")
return this
}
......@@ -70,29 +66,25 @@ class Transform(private val municipalities: Map<String, Municipality>) {
}
}
private fun extractSimpleField(resource: Resource, property: Property, source: JsonObject, fieldName: String) {
source[fieldName].let {
if (it is String) {
resource.addProperty(property, literal(it))
private fun extractAddressField(resource: Resource, mergedAddress: MergedAddress) {
val location = model.createResource()
listOf("de", "fr", "it").forEach {
val streetAddress = mergedAddress.addressLine1.get(it)
val secondAddressLine = mergedAddress.addressLine2.get(it)
val combinedStreetAddress = if (secondAddressLine.isNotEmpty()) {
streetAddress + "\n" + secondAddressLine
} else {
log.warn("No value for $fieldName found in source for institution $uri.")
streetAddress
}
val streetNumber = streetAddress.substringAfterLast(" ")
val street = streetAddress.replace(streetNumber, "").trim()
location.addProperty(WD.street, langLiteral(street, it))
location.addProperty(WD.streetNumber, literal(streetNumber))
location.addProperty(WD.streetAddress, langLiteral(combinedStreetAddress, it))
}
}
private fun extractAddressField(resource: Resource, fieldAddress: JsonObject) {
val location = model.createResource()
val streetAddress = fieldAddress["address_line1"] as String
val secondAddressLine = fieldAddress["address_line2"] as String?
val combinedStreetAddress = if (secondAddressLine != null) {
streetAddress + "\n" + secondAddressLine
} else {
streetAddress
}
val streetNumber = streetAddress.substringAfterLast(" ")
val street = streetAddress.replace(streetNumber, "").trim()
val postalCode = (fieldAddress["postal_code"] as String).trim()
val postalCode = mergedAddress.postalCode.trim()
val municipality = if (municipalities.containsKey(postalCode)) {
municipalities[postalCode]
} else {
......@@ -101,9 +93,6 @@ class Transform(private val municipalities: Map<String, Municipality>) {
null
}
location.addProperty(RDF.type, WD.location)
location.addProperty(WD.street, literal(street))
location.addProperty(WD.streetNumber, literal(streetNumber))
location.addProperty(WD.streetAddress, literal(combinedStreetAddress))
location.addProperty(WD.postalCode, literal(postalCode))
// does not enrich city, canton or cantons, if the postal code is not in the list.
if (municipality != null) {
......@@ -116,7 +105,6 @@ class Transform(private val municipalities: Map<String, Municipality>) {
location.addProperty(WD.coordinates, literal(coordinate))
}
}
//val country = it["country_code"] as String
// country is currently hard coded to switzerland!
location.addProperty(WD.country, WD.switzerland)
resource.addProperty(WD.streetAddress, location)
......
package org.memobase.model
/**
 * One postal address entry of a single language variant of an institution.
 *
 * The snake_case property names mirror the keys of the address objects in the
 * JSON input (see the test fixture); other keys present there are not modelled.
 *
 * @property address_line1 first address line; the transformation splits street and
 * street number from this value.
 * @property address_line2 additional address line; may be an empty string.
 * @property postal_code postal code of the address.
 */
data class Address(
val address_line1: String,
val address_line2: String,
val postal_code: String
)
\ No newline at end of file
package org.memobase.model
/**
 * One language variant of an institution as delivered in the JSON input.
 *
 * The property names mirror the keys of the input document (see the test fixture).
 *
 * @property field_memobase_id identifier of the institution.
 * @property title name of the institution in this language.
 * @property field_text description of the institution in this language.
 * @property field_isil ISIL value of the institution.
 * @property field_email contact e-mail address.
 * @property field_link_archive_catalog link to the online archive catalogue.
 * @property field_website link to the institution's website.
 * @property field_address addresses of the institution.
 */
data class Institution(
val field_memobase_id: String,
val title: String,
val field_text: RichText,
val field_isil: String,
val field_email: String,
val field_link_archive_catalog: Link,
val field_website: Link,
val field_address: List<Address>
)
\ No newline at end of file
package org.memobase.model
/**
 * A text value available in German, French and Italian.
 *
 * @property de the German text.
 * @property fr the French text.
 * @property it the Italian text.
 */
data class LanguageString(
    val de: String,
    val fr: String,
    val it: String
) {
    /**
     * Looks up the text for a language code.
     *
     * @param lang language code; "de" and "fr" select the German and French text.
     * @return the matching text. Every other code, including "it", yields the
     * Italian text.
     */
    fun get(lang: String): String = when (lang) {
        "de" -> this.de
        "fr" -> this.fr
        // `this.it` is the Italian property, not an implicit lambda parameter.
        else -> this.it
    }
}
\ No newline at end of file
package org.memobase.model
/**
 * The three language variants of one institution.
 *
 * This is the type the incoming JSON message is parsed into; the property names
 * correspond to the top-level "de", "fr" and "it" keys of the input document.
 *
 * @property de the German variant.
 * @property fr the French variant.
 * @property it the Italian variant.
 */
class LanguageVariants(
val de: Institution,
val fr: Institution,
val it: Institution
)
\ No newline at end of file
package org.memobase.model
/**
 * A link field of the input document.
 *
 * Only the target is modelled here; the link objects in the test fixture also
 * carry "title" and "options" keys, which have no counterpart in this class.
 *
 * @property uri the target of the link.
 */
data class Link(
val uri: String
)
\ No newline at end of file
package org.memobase.model
/**
 * An address whose text lines are available in all three languages.
 *
 * @property addressLine1 first address line per language.
 * @property addressLine2 second address line per language; individual values may
 * be empty strings.
 * @property postalCode postal code, shared by all languages.
 */
data class MergedAddress(
val addressLine1: LanguageString,
val addressLine2: LanguageString,
val postalCode: String
)
\ No newline at end of file
package org.memobase.model
/**
 * An institution with its three language variants merged into one record.
 *
 * @property id identifier of the institution; appended to the institution
 * namespace to form the resource URI during transformation. Note that this
 * property is `internal`, unlike the others.
 * @property name name of the institution per language.
 * @property description description of the institution per language.
 * @property isil ISIL value of the institution.
 * @property contactEmail contact e-mail address.
 * @property onlineCatalogueLink URI of the online archive catalogue.
 * @property website URI of the institution's website.
 * @property addresses addresses of the institution.
 */
data class MergedInstitution(
internal val id: String,
val name: LanguageString,
val description: LanguageString,
val isil: String,
val contactEmail: String,
val onlineCatalogueLink: String,
val website: String,
val addresses: List<MergedAddress>
)
\ No newline at end of file
package org.memobase
package org.memobase.model
data class Municipality(
val postalCodes: List<String>,
......
package org.memobase.model
/**
 * A formatted text field of the input document.
 *
 * Only the raw value is modelled; the text objects in the test fixture also carry
 * "format" and "processed" keys, which have no counterpart in this class.
 *
 * @property value the text as stored in the source; in the test fixture it
 * contains HTML markup.
 */
data class RichText(
val value: String
)
\ No newline at end of file
......@@ -39,7 +39,7 @@ class Test {
@Test
fun `test municipalities loader`() {
val result = MunicipalitiesLoader.getMunicipalities()
val result = Helpers.getMunicipalities()
assertThat(result)
.isNotNull
.isNotEmpty
......
{
"de": {
"title": "Association Films Plans-Fixes",
"field_memobase_id": "test",
"field_isil": "12345"
"field_isil": "12345",
"field_address": [
{
"langcode": "",
"country_code": "CH",
"administrative_area": null,
"locality": "Lausanne",
"dependent_locality": null,
"postal_code": "2502",
"sorting_code": null,
"address_line1": "Straßenname 12",
"address_line2": "Zusätzliche Adressinformationen",
"organization": "Association Films Plans-Fixes",
"given_name": null,
"additional_name": null,
"family_name": null
},
{
"langcode": "",
"country_code": "DE",
"administrative_area": null,
"locality": "Berlin",
"dependent_locality": null,
"postal_code": "2502",
"sorting_code": null,
"address_line1": " Ostseestraße 107",
"address_line2": "",
"organization": "",
"given_name": null,
"additional_name": null,
"family_name": null
}
],
"field_email": "info@plans-fixess.ch",
"field_link_archive_catalog": {
"uri": "http://www.plansfixes.ch/indexation/",
"title": "",
"options": [
]
},
"field_text": {
"value": "<p>Association Films Plans-Fixes</p>\r\n",
"format": "editorial",
"processed": "Association Films Plans-Fixes"
},
"field_website": {
"uri": "http://www.plansfixes.ch/",
"title": "",
"options": [
]
}
},
"fr": {
"field_memobase_id": "test"
"title": "Association Films Plans-Fixes",
"field_memobase_id": "test",
"field_address": [
{
"langcode": "",
"country_code": "CH",
"administrative_area": null,
"locality": "Lausanne",
"dependent_locality": null,
"postal_code": "2502",
"sorting_code": null,
"address_line1": "Straßenname 12",
"address_line2": "Zusätzliche Adressinformationen",
"organization": "Association Films Plans-Fixes",
"given_name": null,
"additional_name": null,
"family_name": null
},
{
"langcode": "",
"country_code": "DE",
"administrative_area": null,
"locality": "Berlin",
"dependent_locality": null,
"postal_code": "2502",
"sorting_code": null,
"address_line1": " Ostseestraße 107",
"address_line2": "",
"organization": "",
"given_name": null,
"additional_name": null,
"family_name": null
}
],
"field_email": "info@plans-fixess.ch",
"field_isil": "PlansFixes-All",
"field_link_archive_catalog": {
"uri": "http://www.plansfixes.ch/indexation/",
"title": "",
"options": [
]
},
"field_text": {
"value": "<p>Association Films Plans-Fixes</p>\r\n",
"format": "editorial",
"processed": "Association Films Plans-Fixes"
},
"field_website": {
"uri": "http://www.plansfixes.ch/",
"title": "",
"options": [
]
}
},
"it": {
"field_memobase_id": "test"
"title": "Association Films Plans-Fixes",
"field_memobase_id": "test",
"field_address": [
{
"langcode": "",
"country_code": "CH",
"administrative_area": null,
"locality": "Lausanne",
"dependent_locality": null,
"postal_code": "2502",
"sorting_code": null,
"address_line1": "Straßenname 12",
"address_line2": "Zusätzliche Adressinformationen",
"organization": "Association Films Plans-Fixes",
"given_name": null,
"additional_name": null,
"family_name": null
},
{
"langcode": "",
"country_code": "DE",
"administrative_area": null,
"locality": "Berlin",
"dependent_locality": null,
"postal_code": "2502",
"sorting_code": null,
"address_line1": " Ostseestraße 107",
"address_line2": "",
"organization": "",
"given_name": null,
"additional_name": null,
"family_name": null
}
],
"field_email": "info@plans-fixess.ch",
"field_isil": "PlansFixes-All",
"field_link_archive_catalog": {
"uri": "http://www.plansfixes.ch/indexation/",
"title": "",
"options": [
]
},
"field_text": {
"value": "<p>Association Films Plans-Fixes</p>\r\n",
"format": "editorial",
"processed": "Association Films Plans-Fixes"
},
"field_website": {
"uri": "http://www.plansfixes.ch/",
"title": "",
"options": [
]
}
}
}
\ No newline at end of file
{"id":"https://memobase.ch/record/BAZ-MEI_49884","dateCreated.date":"1921-09-14","dateCreated.facet":["0~20.Jahrhundert~","1~20.Jahrhundert~1921-1930#"],"relation":"Bezug Findmittel: Auftragsregister Bd. 6; Bildverzeichnis Bd. 7","placeCaptureRaw":{"de":"Zürich"},"descriptiveNote":{"de":"Villa mit Garten und Brunnen im Vordergrund. Vermutlich von Architekt Walz"},"title":{"de":"«Villa Siegel», Zürich"},"AgentContributorRaw.relation":"Auftraggeber","AgentContributorRaw.name":{"de":"Walz"},"CorporateBodyCreatorRaw.relation":"Fotograf","CorporateBodyCreatorRaw.name":{"de":"Atelier Meiner"},"rightsHolder":"BAZ","type.keyword":"Foto"}
\ No newline at end of file
_:B20812738X2Db181X2D46eaX2Da65cX2D72b85dbc6553 <https://www.ica.org/standards/RiC/ontology#identifier> "test" .
_:B20812738X2Db181X2D46eaX2Da65cX2D72b85dbc6553 <https://www.ica.org/standards/RiC/ontology#type> "main" .
_:B20812738X2Db181X2D46eaX2Da65cX2D72b85dbc6553 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <https://www.ica.org/standards/RiC/ontology#Identifier> .
<https://memobase.ch/institution/test> <http://www.wikidata.org/prop/direct/P856> "http://www.plansfixes.ch/" .
<https://memobase.ch/institution/test> <https://www.ica.org/standards/RiC/ontology#descriptiveNote> "<p>Association Films Plans-Fixes</p>"@it .
<https://memobase.ch/institution/test> <https://www.ica.org/standards/RiC/ontology#descriptiveNote> "<p>Association Films Plans-Fixes</p>"@fr .
<https://memobase.ch/institution/test> <https://www.ica.org/standards/RiC/ontology#descriptiveNote> "<p>Association Films Plans-Fixes</p>"@de .
<https://memobase.ch/institution/test> <http://www.wikidata.org/prop/direct/P2699> "http://www.plansfixes.ch/indexation/" .
<https://memobase.ch/institution/test> <https://www.ica.org/standards/RiC/ontology#identifiedBy> _:B20812738X2Db181X2D46eaX2Da65cX2D72b85dbc6553 .
<https://memobase.ch/institution/test> <https://www.ica.org/standards/RiC/ontology#name> "Association Films Plans-Fixes"@it .
<https://memobase.ch/institution/test> <https://www.ica.org/standards/RiC/ontology#name> "Association Films Plans-Fixes"@fr .
<https://memobase.ch/institution/test> <https://www.ica.org/standards/RiC/ontology#name> "Association Films Plans-Fixes"@de .
<https://memobase.ch/institution/test> <http://www.wikidata.org/prop/direct/P791> "12345" .
<https://memobase.ch/institution/test> <http://www.wikidata.org/prop/direct/P968> "info@plans-fixess.ch" .
<https://memobase.ch/institution/test> <http://www.wikidata.org/prop/direct/P6375> _:B7da6890eX2Dc176X2D4ad4X2D9413X2D61c872645dfd .
<https://memobase.ch/institution/test> <http://www.wikidata.org/prop/direct/P6375> _:B5dd20314X2D0d4bX2D4fdaX2D9e4aX2D0b8e789e167d .
_:B7da6890eX2Dc176X2D4ad4X2D9413X2D61c872645dfd <http://www.wikidata.org/prop/direct/P281> "10409" .
_:B7da6890eX2Dc176X2D4ad4X2D9413X2D61c872645dfd <http://www.wikidata.org/prop/direct/P670> "107" .
_:B7da6890eX2Dc176X2D4ad4X2D9413X2D61c872645dfd <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://www.wikidata.org/entity/Q17334923> .
_:B7da6890eX2Dc176X2D4ad4X2D9413X2D61c872645dfd <http://www.wikidata.org/prop/direct/P17> <http://www.wikidata.org/entity/Q39> .
_:B7da6890eX2Dc176X2D4ad4X2D9413X2D61c872645dfd <http://www.wikidata.org/prop/direct/P6375> "Ostseestraße 107"@it .
_:B7da6890eX2Dc176X2D4ad4X2D9413X2D61c872645dfd <http://www.wikidata.org/prop/direct/P6375> "Ostseestraße 107"@fr .
_:B7da6890eX2Dc176X2D4ad4X2D9413X2D61c872645dfd <http://www.wikidata.org/prop/direct/P6375> "Ostseestraße 107"@de .
_:B7da6890eX2Dc176X2D4ad4X2D9413X2D61c872645dfd <http://www.wikidata.org/prop/direct/P669> "Ostseestraße"@it .
_:B7da6890eX2Dc176X2D4ad4X2D9413X2D61c872645dfd <http://www.wikidata.org/prop/direct/P669> "Ostseestraße"@fr .
_:B7da6890eX2Dc176X2D4ad4X2D9413X2D61c872645dfd <http://www.wikidata.org/prop/direct/P669> "Ostseestraße"@de .
_:B5dd20314X2D0d4bX2D4fdaX2D9e4aX2D0b8e789e167d <http://www.wikidata.org/prop/direct/P6375> "Straßenname 12\nZusätzliche Adressinformationen"@it .
_:B5dd20314X2D0d4bX2D4fdaX2D9e4aX2D0b8e789e167d <http://www.wikidata.org/prop/direct/P6375> "Straßenname 12\nZusätzliche Adressinformationen"@fr .
_:B5dd20314X2D0d4bX2D4fdaX2D9e4aX2D0b8e789e167d <http://www.wikidata.org/prop/direct/P6375> "Straßenname 12\nZusätzliche Adressinformationen"@de .
_:B5dd20314X2D0d4bX2D4fdaX2D9e4aX2D0b8e789e167d <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://www.wikidata.org/entity/Q17334923> .
_:B5dd20314X2D0d4bX2D4fdaX2D9e4aX2D0b8e789e167d <http://www.wikidata.org/prop/direct/P281> "1002" .
_:B5dd20314X2D0d4bX2D4fdaX2D9e4aX2D0b8e789e167d <http://www.wikidata.org/prop/direct/P670> "12" .
_:B5dd20314X2D0d4bX2D4fdaX2D9e4aX2D0b8e789e167d <http://www.wikidata.org/prop/direct/P17> <http://www.wikidata.org/entity/Q39> .
_:B5dd20314X2D0d4bX2D4fdaX2D9e4aX2D0b8e789e167d <http://www.wikidata.org/prop/direct/P669> "Straßenname"@it .
_:B5dd20314X2D0d4bX2D4fdaX2D9e4aX2D0b8e789e167d <http://www.wikidata.org/prop/direct/P669> "Straßenname"@fr .
_:B5dd20314X2D0d4bX2D4fdaX2D9e4aX2D0b8e789e167d <http://www.wikidata.org/prop/direct/P669> "Straßenname"@de .
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment