Commit 137ded76 authored by Jonas Waeber
Browse files

Fix person normalizer (duplicating names).

parent 1c5890cc
Pipeline #17601 passed with stage
in 2 minutes and 33 seconds
package ch.memobase.transform
import org.apache.jena.sparql.vocabulary.FOAF
import org.apache.log4j.LogManager
import ch.memobase.helpers.KEYS
import ch.memobase.rdf.MemobaseModel
import org.memobase.rdf.RICO
import ch.memobase.rdf.RicoResource
import org.apache.jena.sparql.vocabulary.FOAF
import org.apache.log4j.LogManager
import org.memobase.rdf.RICO
class PersonNormalizer(
private val nameOrder: String,
......@@ -19,11 +19,20 @@ class PersonNormalizer(
val names = item.listProperties(RICO.name).toList()
names.forEach { statement ->
val value = statement.`object`.asLiteral()
val text = value.string
val text = value.string.trim()
val splitNames = if (nameOrder == KEYS.firstToLast && nameDelimiter == " ") {
listOf(text.substringBeforeLast(nameDelimiter), text.substringAfterLast(nameDelimiter).trim())
listOf(
text.substringBeforeLast(
nameDelimiter,
missingDelimiterValue = if (singleNameIsLastName) "" else text
).trim(),
text.substringAfterLast(
nameDelimiter,
missingDelimiterValue = if (singleNameIsLastName) text else ""
).trim()
).filter { it != "" }
} else {
text.split(nameDelimiter, limit = 2).map { it.trim() }
text.split(nameDelimiter).map { it.trim() }
}
if (splitNames.size == 1) {
if (singleNameIsLastName) {
......
......@@ -17,8 +17,8 @@
*/
package org.memobase
import ch.memobase.model.NormalizeLanguages
import ch.memobase.rdf.MemobaseModel
import ch.memobase.rdf.RicoResource
import ch.memobase.transform.PersonNormalizer
import org.apache.jena.riot.RDFDataMgr
import org.apache.jena.riot.RDFFormat
......@@ -34,40 +34,94 @@ import java.io.FileOutputStream
@TestInstance(TestInstance.Lifecycle.PER_CLASS)
internal class TestPersonNormalizer {
@Test
fun `test person normalizer`() {
/**
 * Builds a fresh [MemobaseModel] holding one `RICO.Person` resource whose only
 * `RICO.name` literal is [name], and one `RICO.Record` that points at that person
 * through `RDA.hasProducer`.
 *
 * @param name the literal stored as the person's `RICO.name`.
 * @return the person resource paired with the model that owns it.
 */
private fun createModel(name: String): Pair<RicoResource, MemobaseModel> {
    val memobaseModel = MemobaseModel()
    // Register the namespace prefixes so the serialized Turtle uses prefixed names.
    NS.prefixMapping.forEach {
        memobaseModel.setNsPrefix(it.key, it.value)
    }
    // Only the caller-supplied name is attached. A hard-coded "Markus Mäder"
    // literal here would give every fixture a second name and invalidate the
    // single-name test cases.
    val resource = memobaseModel.createRicoResource(RICO.Person)
        .addLiteral(RICO.name, name)
    // The record is created purely for its side effect on the model.
    memobaseModel.createRicoResource(RICO.Record)
        .addProperty(RDA.hasProducer, resource)
    return Pair(resource, memobaseModel)
}
/**
 * Serializes [model] as pretty-printed Turtle to
 * `src/test/resources/tmp/<name>.ttl`.
 *
 * @param name file name (without extension) of the Turtle file to write.
 * @param model the model to serialize.
 */
private fun write(name: String, model: MemobaseModel) {
    // `use` closes the stream even when serialization throws, so the file
    // handle is not leaked and buffered output is flushed.
    FileOutputStream("src/test/resources/tmp/${name}.ttl").use { stream ->
        RDFDataMgr.write(stream, model, RDFFormat.TURTLE_PRETTY)
    }
}
/**
 * A name containing both parts ("Markus Mäder") is split on the space delimiter
 * into `FOAF.firstName` and `FOAF.lastName`, and the transform returns an empty result.
 */
@Test
fun `test input with both first and last name`() {
    val source = createModel("Markus Mäder")
    val n = PersonNormalizer(
        "first-to-last",
        true,
        " "
    )
    // Transform the person created above; the result is written out through
    // the shared helper so the generated Turtle can be inspected.
    val output = n.transform(source.first, source.second)
    write("both_names", source.second)
    assertAll("",
        {
            assertThat(output)
                .isEmpty()
        },
        {
            assertThat(source.first.hasProperty(FOAF.firstName, "Markus")).isTrue()
        },
        {
            assertThat(source.first.hasProperty(FOAF.lastName, "Mäder")).isTrue()
        }
    )
}
/**
 * A single-token name ("Markus") normalized with the arguments
 * `"last-to-first"`, `false`, `" "` ends up as `FOAF.firstName` only:
 * no `FOAF.lastName` is set and the transform returns an empty result.
 */
@Test
fun `test input with only first name`() {
    val (person, model) = createModel("Markus")
    val normalizer = PersonNormalizer("last-to-first", false, " ")

    val output = normalizer.transform(person, model)
    write("first_name_only", model)

    assertAll(
        "",
        { assertThat(output).isEmpty() },
        { assertThat(person.hasProperty(FOAF.firstName, "Markus")).isTrue() },
        { assertThat(person.hasProperty(FOAF.lastName)).isFalse() }
    )
}
/**
 * A single-token name ("Mäder") normalized with the arguments
 * `"first-to-last"`, `true`, `" "` ends up as `FOAF.lastName` only:
 * no `FOAF.firstName` is set and the transform returns an empty result.
 */
@Test
fun `test input with only last name`() {
    val source = createModel("Mäder")
    val n = PersonNormalizer(
        "first-to-last",
        true,
        " "
    )
    val output = n.transform(source.first, source.second)
    write("last_name_only", source.second)
    assertAll("",
        {
            assertThat(output).isEmpty()
        },
        {
            // Only the last name is asserted here; the input has no "Markus"
            // first name, so no first-name value check belongs in this case.
            assertThat(source.first.hasProperty(FOAF.lastName, "Mäder")).isTrue()
        },
        {
            assertThat(source.first.hasProperty(FOAF.firstName)).isFalse()
        }
    )
}
......
......@@ -22,13 +22,10 @@
@prefix foaf: <http://xmlns.com/foaf/0.1/> .
@prefix dc: <http://purl.org/dc/elements/1.1/> .
_:b0 a rico:CreationRelation ;
rico:creationRelationHasSource [ a rico:Record ;
rico:recordResourceOrInstantiationIsSourceOfCreationRelation
_:b0
] ;
rico:creationRelationHasTarget [ a rico:Person ;
rico:agentIsTargetOfCreationRelation
_:b0 ;
rico:name "Markus Mäder (Autor)"
] .
[ a rico:Record ;
rdau:P60441 [ a rico:Person ;
foaf:firstName "Markus" ;
foaf:lastName "Mäder" ;
rico:name "Markus Mäder"
]
] .
@prefix schema: <http://schema.org/> .
@prefix internal: <http://memobase.ch/internal/> .
@prefix mbrs: <https://memobase.ch/recordSet/> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix wdt: <http://www.wikidata.org/prop/direct/> .
@prefix mbpo: <https://memobase.ch/physical/> .
@prefix mbcb: <https://memobase.ch/institution/> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
@prefix skos: <http://www.w3.org/2004/02/skos/core#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix wd: <http://www.wikidata.org/entity/> .
@prefix wdtn: <http://www.wikidata.org/prop/direct-normalized/> .
@prefix mbdo: <https://memobase.ch/digital/> .
@prefix rdau: <http://rdaregistry.info/Elements/u/> .
@prefix fedora: <http://fedora.info/definitions/v4/repository#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rico: <https://www.ica.org/standards/RiC/ontology#> .
@prefix ebucore: <http://www.ebu.ch/metadata/ontologies/ebucore/ebucore#> .
@prefix ldp: <http://www.w3.org/ns/ldp#> .
@prefix dcterms: <http://purl.org/dc/terms/> .
@prefix mbr: <https://memobase.ch/record/> .
@prefix foaf: <http://xmlns.com/foaf/0.1/> .
@prefix dc: <http://purl.org/dc/elements/1.1/> .
[ a rico:Record ;
rdau:P60441 [ a rico:Person ;
foaf:firstName "Markus" ;
rico:name "Markus"
]
] .
@prefix schema: <http://schema.org/> .
@prefix internal: <http://memobase.ch/internal/> .
@prefix mbrs: <https://memobase.ch/recordSet/> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix wdt: <http://www.wikidata.org/prop/direct/> .
@prefix mbpo: <https://memobase.ch/physical/> .
@prefix mbcb: <https://memobase.ch/institution/> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
@prefix skos: <http://www.w3.org/2004/02/skos/core#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix wd: <http://www.wikidata.org/entity/> .
@prefix wdtn: <http://www.wikidata.org/prop/direct-normalized/> .
@prefix mbdo: <https://memobase.ch/digital/> .
@prefix rdau: <http://rdaregistry.info/Elements/u/> .
@prefix fedora: <http://fedora.info/definitions/v4/repository#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rico: <https://www.ica.org/standards/RiC/ontology#> .
@prefix ebucore: <http://www.ebu.ch/metadata/ontologies/ebucore/ebucore#> .
@prefix ldp: <http://www.w3.org/ns/ldp#> .
@prefix dcterms: <http://purl.org/dc/terms/> .
@prefix mbr: <https://memobase.ch/record/> .
@prefix foaf: <http://xmlns.com/foaf/0.1/> .
@prefix dc: <http://purl.org/dc/elements/1.1/> .
[ a rico:Record ;
rdau:P60441 [ a rico:Person ;
foaf:lastName "Mäder" ;
rico:name "Mäder"
]
] .
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment