Commit c526b186 authored by Jonas Waeber
Browse files

Refactor local transform loader.

Refactor styles / logger definitions.
Update tests.
parent 8b891d59
Pipeline #21453 passed with stage
in 2 minutes and 14 seconds
......@@ -27,7 +27,7 @@ import java.io.File
import java.io.FileInputStream
class GlobalTransformsLoader(file: String) {
private val log = LogManager.getLogger("GlobalTransformLoader")
private val log = LogManager.getLogger(this::class.java)
private val objectMapper = ObjectMapper(YAMLFactory()).registerKotlinModule()
private val transforms: GlobalTransform
......
......@@ -16,60 +16,62 @@
package ch.memobase.configs
import ch.memobase.exceptions.InvalidMappingException
import ch.memobase.helpers.KEYS
import ch.memobase.helpers.ValidationError
import ch.memobase.model.LocalTransform
import ch.memobase.rdf.InvalidInputException
import ch.memobase.reporting.Report
import ch.memobase.reporting.ReportStatus
import ch.memobase.transform.ITransformer
import com.fasterxml.jackson.databind.ObjectMapper
import com.fasterxml.jackson.databind.exc.MismatchedInputException
import com.fasterxml.jackson.dataformat.yaml.YAMLFactory
import com.fasterxml.jackson.module.kotlin.registerKotlinModule
import org.apache.logging.log4j.LogManager
import java.io.ByteArrayInputStream
import java.io.ByteArrayOutputStream
import java.util.regex.PatternSyntaxException
import org.apache.logging.log4j.LogManager
class LocalTransformsLoader(data: ByteArray) {
class LocalTransformsLoader(private val data: ByteArray) {
private val log = LogManager.getLogger("LocalTransformsLoader")
private val objectMapper = ObjectMapper(YAMLFactory()).registerKotlinModule()
val errorMessage: String
private val transforms: LocalTransform?
private val transforms = mutableListOf<ITransformer>()
init {
var message = ""
transforms = try {
objectMapper.readValue(ByteArrayInputStream(data), LocalTransform::class.java)
fun parse(key: String): Report {
return try {
if (data.isEmpty())
Report(key, ReportStatus.ignored, "[Local Transform] Ignored empty local transformation file.", KEYS.step)
else {
val results = objectMapper.readValue(ByteArrayInputStream(data), LocalTransform::class.java)
results.splitEntity?.forEach {
transforms.add(it.generate())
}
results.normalizePerson?.generate().let {
if (it != null) {
transforms.addAll(it)
}
}
Report(key, ReportStatus.success, "", KEYS.step)
}
} catch (ex: MismatchedInputException) {
message = "The local transform file could not be parsed: ${ex.localizedMessage}."
val message = "[Local Transform] YamlParserError: ${ex.localizedMessage}"
log.error(message)
Report(key, ReportStatus.fatal, message, KEYS.step)
} catch (ex: ValidationError) {
val message = "[Local Transform] ValidationError: ${ex.localizedMessage}"
log.error(message)
null
} catch (ex: InvalidMappingException) {
message = "The local transform file could not be parsed: ${ex.localizedMessage}."
Report(key, ReportStatus.fatal, message, KEYS.step)
} catch (ex: PatternSyntaxException) {
val message = "[Local Transform] RegexError: ${ex.localizedMessage}"
log.error(message)
null
Report(key, ReportStatus.fatal, message, KEYS.step)
} catch (ex: Exception) {
message = "Unknown exception while parsing local transform: ${ex.localizedMessage}."
val message = "[Local Transform] ${ex.javaClass.name}: ${ex.localizedMessage}"
log.error(message)
null
Report(key, ReportStatus.fatal, message, KEYS.step)
}
errorMessage = message
}
fun get(): List<ITransformer> {
val localTransforms = mutableListOf<ITransformer>()
transforms?.splitEntity?.forEach {
localTransforms.add(it.generate())
}
transforms?.normalizePerson?.generate().let {
if (it != null) {
localTransforms.addAll(it)
}
}
return localTransforms
}
fun getByteStream(): ByteArray {
val writer = objectMapper.writerFor(LocalTransform::class.java)
val output = ByteArrayOutputStream()
writer.writeValue(output, transforms)
return output.toByteArray()
return transforms
}
}
\ No newline at end of file
......@@ -28,4 +28,6 @@ object KEYS {
const val missingLabelDe = "FEHLENDES LABEL"
const val missingLabelFr = "L'ÉTIQUETTE MANQUANTE"
const val missingLabelIt = "GALATEO MANCANTE"
const val step = "normalization-service"
}
/*
Copyright 2020 Jonas Waeber
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package ch.memobase.helpers
/**
 * This class is used to signal a configuration validation error.
 *
 * The message describes the offending configuration value. When the error is raised in
 * response to another exception, pass that exception as the cause so its stack trace is
 * not lost.
 */
class ValidationError : Exception {
    /** Creates a validation error described by [message]. */
    constructor(message: String) : super(message)

    /**
     * Creates a validation error described by [message] that keeps [cause] attached,
     * so wrapping a lower-level exception does not discard the original failure.
     */
    constructor(message: String, cause: Throwable) : super(message, cause)
}
......@@ -15,8 +15,8 @@
*/
package ch.memobase.model
import ch.memobase.exceptions.InvalidMappingException
import ch.memobase.helpers.KEYS.extractCreationRelationNamePatternGroupName
import ch.memobase.helpers.ValidationError
import ch.memobase.transform.ExtractCreationRelationNameTransform
data class ExtractCreationRelationName(
......@@ -27,16 +27,13 @@ data class ExtractCreationRelationName(
"de", "fr", "it", "NONE"
)
init {
fun generate(): ExtractCreationRelationNameTransform {
if (!pattern.contains("(?<$extractCreationRelationNamePatternGroupName>")) {
throw InvalidMappingException("The pattern of the 'extractRelationName' transform requires a named group 'relation'.")
throw ValidationError("Property 'extractRelationName.pattern' requires a named group '(?<$extractCreationRelationNamePatternGroupName>[captured-text])' to be defined, instead was '$pattern'.")
}
if (!validLanguageTags.contains(language)) {
throw InvalidMappingException("The language of the 'extractRelationName' transform must be one of $validLanguageTags.")
throw ValidationError("Property 'extractRelationName.language' must be one of ${validLanguageTags.joinToString(", ")}, instead was '$language'.")
}
}
fun generate(): ExtractCreationRelationNameTransform {
return ExtractCreationRelationNameTransform(Regex(pattern), if (language == "NONE") "" else language)
}
}
......@@ -15,9 +15,10 @@
*/
package ch.memobase.model
import ch.memobase.exceptions.InvalidMappingException
import ch.memobase.helpers.ValidationError
import ch.memobase.transform.ITransformer
import ch.memobase.transform.PersonNormalizer
import org.apache.logging.log4j.LogManager
data class NormalizePerson(
val splitEntity: SplitEntity?,
......@@ -26,18 +27,16 @@ data class NormalizePerson(
val singleNameIsLastName: Boolean,
val nameDelimiter: String
) {
private val log = LogManager.getLogger(this::class.java)
private val validNameOrders = listOf(
"first-to-last",
"last-to-first"
)
init {
if (!validNameOrders.contains(nameOrder))
throw InvalidMappingException("Could not create a person normalizer: Invalid name order $nameOrder. Choose one of $validNameOrders.")
}
fun generate(): List<ITransformer> {
if (!validNameOrders.contains(nameOrder))
throw ValidationError("Property 'nameOrder' requires one of ${validNameOrders.joinToString(", ")}, instead was '$nameOrder'.")
val delimiter = if (nameDelimiter == "SPACE") " " else nameDelimiter
return listOfNotNull(
splitEntity?.generate(),
......
......@@ -15,6 +15,8 @@
*/
package ch.memobase.model
import ch.memobase.exceptions.InvalidMappingException
import ch.memobase.helpers.ValidationError
import ch.memobase.rdf.NS
import ch.memobase.transform.SplitEntityTransform
import org.apache.jena.rdf.model.ResourceFactory
......@@ -25,11 +27,27 @@ data class SplitEntity(
val delimiter: String
) {
fun generate(): SplitEntityTransform {
if (!type.contains(":")) {
throw ValidationError("Property 'type' should have format 'prefix:ClassName', but was '$type' instead.")
}
if (!property.contains(":")) {
throw ValidationError("Property 'property' should have format 'prefix:propertyName', but was '$property' instead.")
}
val typeParts = type.split(":")
val propertyParts = property.split(":")
val typeNamespace = try {
NS.prefixToNamespace(typeParts[0])
} catch (ex: InvalidMappingException) {
throw ValidationError(ex.localizedMessage)
}
val propertyNamespace = try {
NS.prefixToNamespace(propertyParts[0])
} catch (ex: InvalidMappingException) {
throw ValidationError(ex.localizedMessage)
}
return SplitEntityTransform(
type = ResourceFactory.createResource(NS.prefixToNamespace(typeParts[0]) + typeParts[1]),
literal = ResourceFactory.createProperty(NS.prefixToNamespace(propertyParts[0]), propertyParts[1]),
type = ResourceFactory.createResource(typeNamespace + typeParts[1]),
literal = ResourceFactory.createProperty(propertyNamespace, propertyParts[1]),
delimiter = if (delimiter == "SPACE") " " else delimiter
)
}
......
......@@ -15,4 +15,7 @@
*/
package ch.memobase.rdf
/**
 * This exception is used to signal that the input message contains invalid data.
 *
 * When the problem is detected while handling another exception, pass that exception
 * as the cause so the original failure stays attached.
 */
class InvalidInputException : Exception {
    /** Creates the exception with a [message] describing the invalid input. */
    constructor(message: String) : super(message)

    /**
     * Creates the exception with a [message] describing the invalid input and the
     * [cause] that led to it.
     */
    constructor(message: String, cause: Throwable) : super(message, cause)
}
......@@ -23,20 +23,20 @@ import ch.memobase.rdf.MemobaseModel
import ch.memobase.rdf.RDF
import ch.memobase.rdf.RICO
import ch.memobase.rdf.RicoResource
import org.apache.log4j.LogManager
import org.apache.logging.log4j.LogManager
class CarrierTypeNormalizer(
private val facetsMap: Map<String, Facets>,
private val labelsMap: Map<String, Labels>
private val facetsMap: Map<String, Facets>,
private val labelsMap: Map<String, Labels>
) : ITransformer {
private val log = LogManager.getLogger("CarrierTypeNormalizer")
private val log = LogManager.getLogger(this::class.java)
override fun transform(item: RicoResource, model: MemobaseModel): List<String> {
return if (item.hasType(RICO.CarrierType)) {
if (item.hasProperty(RICO.name)) {
val allStatements = item.listProperties()
val copyStatements =
allStatements.filter { it.predicate != RICO.name }.filter { it.predicate != RDF.type }
allStatements.filter { it.predicate != RICO.name }.filter { it.predicate != RDF.type }
item.listProperties(RICO.name).mapNotNull { statement ->
val nameValue = statement.string.trim()
val record = findPhysicalInstantiation(model)
......@@ -45,20 +45,20 @@ class CarrierTypeNormalizer(
if (facetValues != null) {
facetValues.wikidata.forEach { facetValue ->
builder
.init()
.addLabels(RICO.name, labelsMap.getOrDefault(facetValue, Labels.default))
.addSameAsLink(facetValue)
.addOtherStatements(copyStatements)
.appendActivity(KEYS.carrierTypeNormalizerMechanismName, item)
.build()
.init()
.addLabels(RICO.name, labelsMap.getOrDefault(facetValue, Labels.default))
.addSameAsLink(facetValue)
.addOtherStatements(copyStatements)
.appendActivity(KEYS.carrierTypeNormalizerMechanismName, item)
.build()
}
facetValues.freeFacetValue.forEach { facetValue ->
builder
.init()
.addLabels(RICO.name, labelsMap.getOrDefault(facetValue, Labels.default))
.addOtherStatements(copyStatements)
.appendActivity(KEYS.carrierTypeNormalizerMechanismName, item)
.build()
.init()
.addLabels(RICO.name, labelsMap.getOrDefault(facetValue, Labels.default))
.addOtherStatements(copyStatements)
.appendActivity(KEYS.carrierTypeNormalizerMechanismName, item)
.build()
}
null
} else {
......@@ -79,6 +79,6 @@ class CarrierTypeNormalizer(
private fun findPhysicalInstantiation(model: MemobaseModel): RicoResource {
return model.listRicoResourceSubjects().filter { it.hasProperty(RDF.type, RICO.Instantiation) }
.first { it.hasLiteral(RICO.type, "physicalObject") }
.first { it.hasLiteral(RICO.type, "physicalObject") }
}
}
......@@ -20,13 +20,15 @@ import ch.memobase.rdf.MemobaseModel
import ch.memobase.rdf.RDF
import ch.memobase.rdf.RICO
import ch.memobase.rdf.RicoResource
import org.apache.logging.log4j.LogManager
class DateNormalizationTransform(
private val singleDateMatchers: List<Regex>,
private val dateRangeMatchers: List<Regex>,
private val certaintyValues: List<Regex>,
private val qualifierValues: List<Regex>
private val singleDateMatchers: List<Regex>,
private val dateRangeMatchers: List<Regex>,
private val certaintyValues: List<Regex>,
private val qualifierValues: List<Regex>
) : ITransformer {
private val log = LogManager.getLogger(this::class.java)
private var value = ""
override fun transform(item: RicoResource, model: MemobaseModel): List<String> {
......@@ -42,11 +44,11 @@ class DateNormalizationTransform(
val match = it.matchEntire(value)
if (match != null) {
val day = match.groups["day"]?.value
?: error("Single day regex is missing capture group 'day': ${it.pattern}.")
?: error("Single day regex is missing capture group 'day': ${it.pattern}.")
val month = match.groups["month"]?.value
?: error("Single day regex is missing capture group 'month': ${it.pattern}.")
?: error("Single day regex is missing capture group 'month': ${it.pattern}.")
val year = match.groups["year"]?.value
?: error("Single day regex is missing capture group 'year': ${it.pattern}.")
?: error("Single day regex is missing capture group 'year': ${it.pattern}.")
getSingleDate(day, month, year)
} else {
null
......@@ -54,10 +56,10 @@ class DateNormalizationTransform(
}
if (normalizedSingleDate.isNotEmpty()) {
item
.removeAllProperties(RICO.expressedDate)
.addLiteral(RICO.normalizedDateValue, normalizedSingleDate[0])
.removeAllProperties(RDF.type)
.replaceRdfType(RICO.SingleDate)
.removeAllProperties(RICO.expressedDate)
.addLiteral(RICO.normalizedDateValue, normalizedSingleDate[0])
.removeAllProperties(RDF.type)
.replaceRdfType(RICO.SingleDate)
emptyList()
} else {
// if no single date match is found try to remove qualifiers and certainty strings.
......@@ -124,8 +126,6 @@ class DateNormalizationTransform(
null
}
}
// TODO: get language tags into this?
qualifiers.forEach {
item.addLiteral(RICO.dateQualifier, it)
}
......@@ -167,7 +167,14 @@ class DateNormalizationTransform(
return "$year-$month-$day"
}
private fun getDateRange(dayFrom: String?, dayUntil: String?, monthFrom: String?, monthUntil: String?, yearFrom: String?, yearUntil: String?): String {
private fun getDateRange(
dayFrom: String?,
dayUntil: String?,
monthFrom: String?,
monthUntil: String?,
yearFrom: String?,
yearUntil: String?
): String {
return when {
dayFrom != null && dayUntil != null && monthFrom != null && yearFrom != null ->
when {
......
......@@ -22,7 +22,7 @@ import ch.memobase.rdf.RicoResource
import org.apache.logging.log4j.LogManager
class ExtractCreationRelationNameTransform(private val regex: Regex, private val language: String) : ITransformer {
private val log = LogManager.getLogger("ExtractCreationRelationName")
private val log = LogManager.getLogger(this::class.java)
override fun transform(item: RicoResource, model: MemobaseModel): List<String> {
if (item.hasType(RICO.CreationRelation)) {
......
......@@ -25,14 +25,14 @@ import ch.memobase.rdf.RDF
import ch.memobase.rdf.RICO
import ch.memobase.rdf.RicoResource
import ch.memobase.rdf.SKOS
import org.apache.log4j.LogManager
import org.apache.logging.log4j.LogManager
class GenreNormalizer(
private val facetsMap: Map<String, Facets>,
private val labelsMap: Map<String, Labels>
) :
ITransformer {
private val log = LogManager.getLogger("GenreNormalizer")
private val log = LogManager.getLogger(this::class.java)
override fun transform(item: RicoResource, model: MemobaseModel): List<String> {
return if (item.hasType(SKOS.Concept)) {
......
......@@ -23,14 +23,14 @@ import ch.memobase.rdf.MemobaseModel
import ch.memobase.rdf.RDF
import ch.memobase.rdf.RICO
import ch.memobase.rdf.RicoResource
import org.apache.log4j.LogManager
import org.apache.logging.log4j.LogManager
class LanguagesNormalizer(
private val facetsMap: Map<String, Facets>,
private val labelsMap: Map<String, Labels>
) :
ITransformer {
private val log = LogManager.getLogger("LanguagesNormalizer")
private val log = LogManager.getLogger(this::class.java)
override fun transform(item: RicoResource, model: MemobaseModel): List<String> {
return if (item.hasType(RICO.Language)) {
......
......@@ -20,14 +20,14 @@ import ch.memobase.rdf.MemobaseModel
import ch.memobase.rdf.RICO
import ch.memobase.rdf.RicoResource
import org.apache.jena.sparql.vocabulary.FOAF
import org.apache.log4j.LogManager
import org.apache.logging.log4j.LogManager
class PersonNormalizer(
private val nameOrder: String,
private val singleNameIsLastName: Boolean,
private val nameDelimiter: String
) : ITransformer {
private val log = LogManager.getLogger("PersonNormalizer")
private val log = LogManager.getLogger(this::class.java)
override fun transform(item: RicoResource, model: MemobaseModel): List<String> {
return if (item.hasType(RICO.Person)) {
......
......@@ -22,9 +22,11 @@ import ch.memobase.rdf.RicoResource
import org.apache.jena.rdf.model.Property
import org.apache.jena.rdf.model.Resource
import org.apache.jena.rdf.model.impl.StatementImpl
import org.apache.logging.log4j.LogManager
class SplitEntityTransform(private val type: Resource, private val literal: Property, private val delimiter: String) :
ITransformer {
private val log = LogManager.getLogger(this::class.java)
private var oldCreationRelation: Resource? = null
......
/*
* Normalization Service Configuration
* Copyright (C) 2020 Memoriav
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package org.memobase.test
import ch.memobase.configs.GlobalTransformsLoader
import ch.memobase.configs.LocalTransformsLoader
import ch.memobase.rdf.MemobaseModel
import ch.memobase.settings.HeaderMetadata
import java.io.File
import org.apache.jena.riot.Lang
import org.apache.jena.riot.RDFDataMgr
import org.junit.jupiter.api.Disabled
import org.junit.jupiter.api.Test
import org.junit.jupiter.api.TestInstance
@TestInstance(TestInstance.Lifecycle.PER_CLASS)
@Disabled
class LocalTestRun {
private val headerMetadata = HeaderMetadata(
"srf-001",
"1",
"srf",
false,
"record",
"identifierMain",
0, 0, 0, 0
)
/*
@Test
@Disabled
fun `test a single file`() {
val folder = "/home/jonas/memobase/data/apf-001"
val inputPath = "/home/jonas/memobase/data/test/step-2/Fonoteca-18BD1047_A29"
val outputPath = "/home/jonas/memobase/data/test/step-3/snp-001-18BD1047_A29"
val mappingFile = "/config/mapping.yml"
val klaxon = Klaxon()
val configurationParser = MappingConfigurationParser(File(folder + mappingFile).readBytes())
val configuration = configurationParser.get()
val item = klaxon.parse<Map<String, Any>>(File(inputPath)).orEmpty()
val builder = ResourceBuilder(
item,
configuration,
headerMetadata.institutionId,
headerMetadata.recordSetId,
headerMetadata.isPublished
)
val result = builder.extractRecordId()
.extractRecordTypeValue()
.generateRecord()
.generatePhysicalObject()
.generateDigitalObject()
.addDerivedConnection()
val writtenResult = result
.writeRecord(RDFFormat.NTRIPLES_UTF8)
val writtenResultTurtle = result
.writeRecord(RDFFormat.TURTLE_PRETTY)
FileOutputStream(File(outputPath)).use {
it.bufferedWriter().use { writer ->
writer.write(writtenResult.second)
}
}
FileOutputStream(File("output.ttl")).use {
it.bufferedWriter().use { writer ->
writer.write(writtenResultTurtle.second)
}
}
}
*/
@Test
@Disabled
fun `test local folder`() {
val folder = "/home/jonas/memobase/data/srf-001"
val inputFolder = "/home/jonas/memobase/data/test/step-3"
val outputFolder = "/home/jonas/memobase/data/test/step-4"
val mappingFile = "/config/localTransforms.yml"
val global = GlobalTransformsLoader("src/test/resources/global/transforms.yml")
val local = LocalTransformsLoader(File(folder + mappingFile).readBytes())
val transformConfigs = local.get() + global.get()
File(inputFolder)
.walk(FileWalkDirection.TOP_DOWN)
.maxDepth(1)
.filter { it.isFile }
.map { Pair(it.name, it) }