Commit 0e5460f2 authored by Jonas Waeber

[WIP] Update record sets.

parent 3d631a1b
Pipeline #21672 failed with stages
in 2 minutes and 28 seconds
......@@ -34,7 +34,8 @@ ext {
dependencies {
compile group: 'org.elasticsearch.client', name: 'elasticsearch-rest-high-level-client', version: '7.6.1'
implementation 'org.memobase:memobase-service-utilities:2.0.5'
implementation 'org.memobase:memobase-service-utilities:2.0.9'
implementation 'org.apache.jena:apache-jena:3.14.0'
// Logging Framework
implementation "org.apache.logging.log4j:log4j-api:${log4jV}"
......
......@@ -18,12 +18,16 @@
package org.memobase
import ch.memobase.rdf.DC
import ch.memobase.rdf.RDA
import ch.memobase.rdf.RDF
import ch.memobase.rdf.RICO
import com.beust.klaxon.JsonObject
import java.lang.NumberFormatException
import org.apache.logging.log4j.LogManager
import org.memobase.helpers.Date
import org.memobase.helpers.ElasticSearchWrapper
import org.memobase.helpers.Extract
import org.memobase.helpers.JSON
import org.memobase.helpers.KEYS
import org.memobase.model.FacetContainer
import org.memobase.model.IntegerRange
......@@ -33,46 +37,80 @@ import org.memobase.model.Schema
class RecordSetSearchDocBuilder(private val elasticSearchWrapper: ElasticSearchWrapper) {
private val log = LogManager.getLogger("RecordSetSearchDocBuilder")
private val log = LogManager.getLogger(this::class.java)
fun transform(key: String, input: Map<String, JsonObject>): Schema {
val recordSet =
input["recordSet"] ?: throw InvalidInputException("No recordSet entity found in message $key.")
input[JSON.recordSetTag] ?: throw InvalidInputException("No record set entity found in message $key.")
val publicationIds = Extract.identifiers(recordSet[RICO.isSubjectOf.localName])
val relatedRecordSetIds = Extract.identifiers(recordSet[RICO.isRecordResourceAssociatedWithRecordResource.localName])
val metadataLanguages = mutableListOf<JsonObject>()
var originalTitles = LanguageContainer.EMPTY
var projectTitles = LanguageContainer.EMPTY
var relatedRecordSets = LanguageContainer.EMPTY
var publicationTitles = LanguageContainer.EMPTY
var relatedDocumentTitles = LanguageContainer.EMPTY
input.values.forEach {
when {
it[KEYS.ricoType] == KEYS.LanguageType.metadata -> {
it[RICO.type.localName] == KEYS.LanguageType.metadata -> {
metadataLanguages.add(it)
}
it[KEYS.atType] == RICO.Title.uri &&
it[RICO.type.localName] == KEYS.TitleTypes.original -> {
originalTitles = originalTitles.add(it[RICO.title.localName])
}
it[KEYS.atType] == RICO.CorporateBody.uri &&
it[RICO.type.localName] == KEYS.CorporateBodyType.memoriavProject -> {
projectTitles = projectTitles.add(it[RICO.title.localName])
}
it[KEYS.atType] == RICO.RecordSet.uri &&
relatedRecordSetIds.contains(it[KEYS.entityId]) -> {
relatedRecordSets = relatedRecordSets.add(it[RICO.title.localName])
}
it[KEYS.atType] == RICO.Record.uri -> {
if (publicationIds.contains(it[KEYS.entityId])) {
publicationTitles = publicationTitles.add(it[RICO.title.localName])
} else {
relatedDocumentTitles = relatedDocumentTitles.add(it[RICO.title.localName])
}
}
}
}
val name = extractLanguageContainer(recordSet[KEYS.title], "NoNameFound")
val description = extractLanguageContainer(recordSet[KEYS.descriptiveNote], "NoDescriptionFound")
val dates = Extract.identifiers(recordSet[KEYS.isAssociatedWithDate]).mapNotNull {
val name = extractLanguageContainer(recordSet[RICO.title.localName], "")
val dates = Extract.identifiers(recordSet[RICO.isAssociatedWithDate.localName]).mapNotNull {
input[it]
}.map {
it[KEYS.normalizedDateValue] as String
it[RICO.normalizedDateValue.localName] as String
}
val date = if (dates.isNotEmpty()) {
try {
val splitDate = dates[0].split("/")
if (splitDate.size == 2) {
IntegerRange(splitDate[0].toInt(), splitDate[1].toInt())
}
else
} else
IntegerRange(splitDate[0].toInt(), splitDate[0].toInt())
} catch (ex: NumberFormatException) {
IntegerRange(3000, 3001)
null
}
} else {
IntegerRange(3000, 3001)
null
}
val uri = recordSet[KEYS.entityId] as String
val id = uri.substringAfterLast("/")
val institution = recordSet[KEYS.heldBy] as String
val institution = recordSet[RICO.heldBy.localName] as String
val institutionId = institution.substringAfterLast("/")
val description = extractLanguageContainer(recordSet[RICO.descriptiveNote.localName], "")
val rights = extractLanguageContainer(recordSet[RICO.conditionsOfUse.localName], "")
val access = extractLanguageContainer(recordSet[RICO.conditionsOfAccess.localName], "")
val accessMemobase = extractLanguageContainer(recordSet[RDA.hasRestrictionOnAccess.localName], "")
val history = extractLanguageContainer(recordSet[RICO.history.localName], "")
val integrity = extractLanguageContainer(recordSet[RICO.integrity.localName], "")
val extent = extractLanguageContainer(recordSet[RICO.recordResourceExtent.localName], "")
val scopeAndContent = extractLanguageContainer(recordSet[RICO.scopeAndContent.localName], "")
val conformsTo = extractLanguageContainer(recordSet[DC.conformsTo.localName], "")
val dataImport = extractLanguageContainer(recordSet[RICO.descriptiveNote.localName], "")
return RecordSetSearchDoc(
recordSetId = id,
......@@ -86,7 +124,6 @@ class RecordSetSearchDocBuilder(private val elasticSearchWrapper: ElasticSearchW
}
}
},
scopeAndContent = description,
periodOfTimeAsYear = date,
institution = elasticSearchWrapper.getInstitutionName(institutionId),
supportedByMemoriav = recordSet[KEYS.sponsoredBy] != null,
......@@ -107,7 +144,22 @@ class RecordSetSearchDocBuilder(private val elasticSearchWrapper: ElasticSearchW
},
emptyList()
)
}
},
scopeAndContent = scopeAndContent,
accessMemobase = accessMemobase,
context = history,
originalTitle = originalTitles,
extent = extent,
selection = integrity,
indexing = conformsTo,
rights = rights,
description = description,
access = access,
project = projectTitles,
relatedRecordSets = relatedRecordSets,
relatedPublications = publicationTitles,
relatedDocuments = relatedDocumentTitles,
dataImport = dataImport
)
}
......
......@@ -119,12 +119,12 @@ object Extract {
fun identifiers(value: Any?): List<String> {
return when (value) {
is String -> listOf(value)
is JsonObject -> value[KEYS.atType].let { if (it is String) listOf(it) else emptyList() }
is JsonObject -> value[KEYS.entityId].let { if (it is String) listOf(it) else emptyList() }
is JsonArray<*> ->
value.mapNotNull { item ->
when (item) {
is String -> item
is JsonObject -> value[KEYS.atType].let { id: Any? ->
is JsonObject -> value[KEYS.entityId].let { id: Any? ->
if (id is String)
id
else null
......
......@@ -18,6 +18,7 @@
package org.memobase.helpers
import ch.memobase.rdf.NS
import ch.memobase.rdf.RICO
import com.beust.klaxon.JsonArray
import com.beust.klaxon.JsonObject
import com.beust.klaxon.Klaxon
......@@ -33,7 +34,6 @@ object JSON {
private const val Record = "Record"
private const val RecordSet = "RecordSet"
private const val CorporateBody = "CorporateBody"
private const val memobaseInstitutionType = "memobaseInstitution"
const val institutionTag = "institution"
const val recordTag = "record"
const val recordSetTag = "recordSet"
......@@ -52,11 +52,11 @@ object JSON {
fun unpack(input: JsonObject): Map<String, JsonObject> {
val graph = input[graph] as JsonArray<JsonObject>
return graph.map {
if (it[KEYS.atType] == NS.rico + Record) {
if (it[KEYS.atType] == RICO.Record.uri) {
Pair(recordTag, it)
} else if (it[KEYS.atType] == NS.rico + RecordSet) {
} else if (it[KEYS.atType] == RICO.RecordSet.uri) {
Pair(recordSetTag, it)
} else if (it[KEYS.atType] == NS.rico + CorporateBody && it[KEYS.ricoType] == memobaseInstitutionType) {
} else if (it[KEYS.atType] == RICO.CorporateBody.uri && it[KEYS.ricoType] == KEYS.CorporateBodyType.memobaseInstitution) {
Pair(institutionTag, it)
} else {
Pair(it[KEYS.entityId] as String, it)
......
......@@ -35,9 +35,8 @@ object KEYS {
const val institutionIndex = "elastic.institutionIndex"
}
const val conditionsOfUse = "conditionsOfUse"
const val isAssociatedWithDate = "isAssociatedWithDate"
const val normalizedDateValue = "normalizedDateValue"
const val sameAs = "sameAs"
const val entityId = "@id"
const val atType = "@type"
......@@ -48,6 +47,8 @@ object KEYS {
const val teaserColorComputed = "teaserColorComputed"
// Namespace rico:
const val scopeAndContent = "scopeAndContent"
const val recordResourceExtent = "recordResourceExtent"
const val ricoType = "type"
const val firstName = "firstName"
const val lastName = "lastName"
......@@ -65,6 +66,13 @@ object KEYS {
const val identifiedBy = "identifiedBy"
const val hasSubject = "hasSubject"
const val hasLocation = "hasLocation"
const val integrity = "integrity"
const val history = "history"
const val conditionsOfAccess = "conditionsOfAccess"
const val conditionsOfUse = "conditionsOfUse"
const val isAssociatedWithDate = "isAssociatedWithDate"
const val normalizedDateValue = "normalizedDateValue"
// rico classes
const val Person = "Person"
......@@ -98,7 +106,7 @@ object KEYS {
const val issued = "issued"
const val created = "created"
const val temporal = "temporal"
const val conformsTo = "conformsTo"
// namespace wdt:
const val wikidataInstance = "P31"
......@@ -121,6 +129,7 @@ object KEYS {
}
object TitleTypes {
const val original = "original"
const val main = "main"
const val series = "series"
const val broadcast = "broadcast"
......@@ -135,7 +144,10 @@ object KEYS {
// Values of the `type` property that distinguish the kinds of place entities.
object LocationType {
// Place entity that represents a canton.
const val canton = "canton"
// Place entity that represents a municipality.
const val municipality = "municipality"
}
// Values of the `type` property that are compared against corporate body entities.
object CorporateBodyType {
// Marks the corporate body that JSON.unpack stores under the institution tag.
const val memobaseInstitution = "memobaseInstitution"
const val memobaseProject = "memobaseProject"
// Marks corporate bodies whose titles are collected as project titles of a record set.
const val memoriavProject = "memoriavProject"
}
}
......@@ -23,18 +23,13 @@ import org.memobase.helpers.KEYS
@JsonInclude(JsonInclude.Include.NON_NULL)
data class LanguageContainer(
val de: List<String>,
val fr: List<String>,
val it: List<String>,
val un: List<String> // if the language is not known
val de: List<String> = emptyList(),
val fr: List<String> = emptyList(),
val it: List<String> = emptyList(),
val un: List<String> = emptyList() // if the language is not known
) {
companion object {
val EMPTY = LanguageContainer(
emptyList(),
emptyList(),
emptyList(),
emptyList()
)
val EMPTY = LanguageContainer()
val DEFAULT = LanguageContainer(
listOf(KEYS.missingLabelDe),
listOf(KEYS.missingLabelFr),
......@@ -59,6 +54,57 @@ data class LanguageContainer(
else -> EMPTY
}
}
const val valueJsonLDFieldName = "@value"
const val languageJsonLDFieldName = "@language"
}
/**
 * Returns a new container holding the values of this container plus the values found in [value].
 *
 * Supported inputs:
 * - `null`: nothing is added, this container is returned.
 * - [String]: added to [un], because a plain string carries no language tag.
 * - [Map]: treated as a JSON-LD language-tagged literal (`@value` / `@language`).
 * - [List]: every element is added in order; elements may be strings or JSON-LD literals,
 *   anything else inside the list is ignored.
 * - any other type: ignored, this container is returned.
 *
 * The list branch folds over the elements starting from this container. This keeps the values
 * already present exactly once (they were previously dropped for string-only lists and included
 * once per literal for lists of maps) and makes an empty list a no-op instead of an exception.
 *
 * @param value raw JSON value of a property, may be null.
 * @return a container with the existing and the newly added values.
 */
@Suppress("UNCHECKED_CAST")
fun add(value: Any?): LanguageContainer {
    return when (value) {
        null -> this
        is String -> copy(un = un + value)
        is Map<*, *> -> mapRdfLanguageTags(value as Map<String, Any?>)
        is List<*> ->
            // Start from this container so existing values are carried over exactly once.
            value.fold(this) { collected, element ->
                when (element) {
                    is String -> collected.copy(un = collected.un + element)
                    is Map<*, *> -> collected.mapRdfLanguageTags(element as Map<String, Any?>)
                    else -> collected
                }
            }
        else -> this
    }
}
/**
 * Adds a single JSON-LD language-tagged literal to a copy of this container.
 *
 * The text is read from the `@value` field and the language from the `@language` field.
 * Both are only used when they are strings. A literal without a usable text adds nothing.
 * A literal without a usable language tag is appended to [un].
 *
 * NOTE(review): a text tagged with any language other than "de", "fr" or "it" is dropped.
 * Confirm that this is intended.
 *
 * @param value the JSON-LD literal object.
 * @return a new container with the text appended to the list of the matching language.
 */
private fun mapRdfLanguageTags(value: Map<String, Any?>): LanguageContainer {
    val text = value[valueJsonLDFieldName] as? String
    val tag = value[languageJsonLDFieldName] as? String

    // Without a text there is nothing to append; still hand back a fresh instance.
    if (text == null) {
        return copy()
    }
    return when (tag) {
        "de" -> copy(de = de + text)
        "fr" -> copy(fr = fr + text)
        "it" -> copy(it = it + text)
        null -> copy(un = un + text)
        else -> copy()
    }
}
fun toList(): List<String> {
......
......@@ -25,28 +25,43 @@ import com.fasterxml.jackson.annotation.JsonInclude
data class RecordSetSearchDoc(
@JsonIgnore
val recordSetId: String,
val name: LanguageContainer,
val isPublished: Boolean,
val name: LanguageContainer,
// Display
val keyVisualLink: String,
val numberOfDocuments: Int,
// Facets
val documentType: List<FacetContainer>,
val periodOfTimeAsYear: IntegerRange?,
val supportedByMemoriav: Boolean,
val languageOfMetadata: List<FacetContainer>,
val institution: FacetContainer,
val periodOfTimeAsYear: IntegerRange,
// Sort
val lastUpdatedDate: String,
// Search
val scopeAndContent: LanguageContainer,
// Display
val keyVisualLink: String,
val numberOfDocuments: Int
val scopeAndContent: LanguageContainer?,
val accessMemobase: LanguageContainer?,
val context: LanguageContainer?,
val originalTitle: LanguageContainer?,
val extent: LanguageContainer?,
val selection: LanguageContainer?,
val indexing: LanguageContainer?,
val rights: LanguageContainer?,
val description: LanguageContainer?,
val access: LanguageContainer?,
val project: LanguageContainer?,
val relatedRecordSets: LanguageContainer?,
val relatedPublications: LanguageContainer?,
val relatedDocuments: LanguageContainer?,
val dataImport: LanguageContainer?
) : Schema(recordSetId) {
companion object {
const val lorem = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."
const val placeholderTitle = "Placeholder Title"
val DEFAULT = RecordSetSearchDoc(
recordSetId = "NoRecordSetId",
name = LanguageContainer.placeholder("TEST_RECORD_SET"),
......@@ -68,10 +83,26 @@ data class RecordSetSearchDoc(
facet = emptyList()
),
periodOfTimeAsYear = IntegerRange(2000, 2020),
scopeAndContent = LanguageContainer.placeholder("Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."),
lastUpdatedDate = "2020-11-20T10:29:01.128",
keyVisualLink = "https://mb-wf1.memobase.unibas.ch/sites/default/files/styles/teaser/public/2020-10/1.jpg?itok=5ncVBnVQ",
numberOfDocuments = 100
numberOfDocuments = 100,
scopeAndContent = LanguageContainer.placeholder(lorem),
accessMemobase = LanguageContainer.placeholder(lorem),
context = LanguageContainer.placeholder(lorem),
originalTitle = LanguageContainer.placeholder(placeholderTitle),
extent = LanguageContainer.placeholder(lorem),
selection = LanguageContainer.placeholder(lorem),
indexing = LanguageContainer.placeholder(lorem),
rights = LanguageContainer.placeholder(lorem),
description = LanguageContainer.placeholder(lorem),
access = LanguageContainer.placeholder(lorem),
project = LanguageContainer.placeholder(placeholderTitle),
relatedRecordSets = LanguageContainer.placeholder(placeholderTitle),
relatedPublications = LanguageContainer.placeholder(placeholderTitle),
relatedDocuments = LanguageContainer.placeholder(placeholderTitle),
dataImport = LanguageContainer.placeholder(lorem)
)
}
}
\ No newline at end of file
package org.memobase
import ch.memobase.reporting.Report
import ch.memobase.reporting.ReportStatus
import com.fasterxml.jackson.databind.ObjectMapper
import com.fasterxml.jackson.module.kotlin.registerKotlinModule
import io.mockk.every
......@@ -43,33 +44,33 @@ class TestRecordSetSearchDoc {
@Test
fun `transform record set search doc`() {
val settings = App.createSettings("kafkaTest1.yml")
val data = readFile("input1.json")
val data = readFile("completeExample.json")
val wrapper = mockk<ElasticSearchWrapper>()
every { wrapper.countNumberOfDocuments("sap-021") } returns 102
every { wrapper.getDocumentTypesFromRecords("sap-021", "recordSet.facet") } returns listOf(FacetContainer(LanguageContainer(listOf("Fotographie"), listOf("Photographie"), listOf("Fotografia"), emptyList()), null, emptyList()))
every { wrapper.getInstitutionName("sap") } returns FacetContainer(
every { wrapper.countNumberOfDocuments("testComplete") } returns 102
every { wrapper.getDocumentTypesFromRecords("testComplete", "recordSet.facet") } returns listOf(FacetContainer(LanguageContainer(listOf("Fotographie"), listOf("Photographie"), listOf("Fotografia"), emptyList()), null, emptyList()))
every { wrapper.getInstitutionName("completeInstitution") } returns FacetContainer(
LanguageContainer(
listOf("Staatsarchiv des Kantons Aargau"),
listOf("Archives de l’Etat Argovie"),
listOf("Archivio cantonale del Argovia"),
listOf("Test Complete"),
listOf("Test Complete"),
listOf("Test Complete"),
emptyList()
),
"sap",
"completeExampleTest",
emptyList()
)
val input = JSON.unpack(JSON.parse(data))
val searchDocBuilder = RecordSetSearchDocBuilder(wrapper)
val result = searchDocBuilder.transform("https://memobase.ch/recordSet/sap-021", input)
val result = searchDocBuilder.transform("https://memobase.ch/recordSet/completeExample", input)
result as RecordSetSearchDoc
val resultString = result.toJson().replace(TestUtilities.dateRegex, "2020")
val targetString = readFile("output1.json").replace(TestUtilities.dateRegex, "2020")
val targetString = readFile("completeExampleOutput.json").replace(TestUtilities.dateRegex, "2020")
assertAll("",
{
assertThat(result.id).isEqualTo("sap-021")
assertThat(result.id).isEqualTo("testComplete")
},
{
assertThat(resultString).isEqualTo(
......@@ -100,8 +101,8 @@ class TestRecordSetSearchDoc {
testDriver.pipeInput(
factory.create(
settings.inputTopic,
"FSS-HM",
readFile("input1.json")
"testComplete",
readFile("completeExample.json")
)
)
......@@ -122,21 +123,21 @@ class TestRecordSetSearchDoc {
val key = record.key()
val value = record.value().replace(TestUtilities.dateRegex, "2020")
val resultValue = readFile("output1.json").replace(TestUtilities.dateRegex, "2020")
val resultValue = readFile("completeExampleOutput.json").replace(TestUtilities.dateRegex, "2020")
assertAll("",
{
assertThat(value)
.isEqualTo(resultValue)
},
{ assertThat(key).isEqualTo("FSS-HM") },
{ assertThat(reportKey).isEqualTo("FSS-HM") },
{ assertThat(key).isEqualTo("testComplete") },
{ assertThat(reportKey).isEqualTo("testComplete") },
{
assertThat(reportValue).isEqualTo(
Report(
"FSS-HM",
"SUCCESS",
"Transformed message into search doc.",
"testComplete",
ReportStatus.success,
"",
Service.name
)
)
......
{"published":false,"name":{"de":["Complete Example"],"fr":["Complete Example"],"it":["Complete Example"],"un":[]},"description":{"de":["<p>Beschreibung (DE)</p>\r\n\r\n<p>Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p>"],"fr":["<p>Beschreibung (FR)</p>\r\n\r\n<p>Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p>"],"it":["<p>Beschreibung (IT)</p>\r\n\r\n<p>Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. 
Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p>"],"un":[]},"city":[{"de":[],"fr":[],"it":[],"un":["City"]}],"address":["Street Address"],"postalCodes":["1000"],"canton":[{"name":{"de":["Aargau"],"fr":["Argovie"],"it":["Argovia"],"un":[]}}],"type":[{"name":{"de":["Veranstalter"],"fr":["Organisateur"],"it":["Organizzatore"],"un":[]},"filter":"Q2029941"}],"documentType":[{"name":{"de":["Foto"],"fr":["Foto"],"it":["Foto"],"un":[]},"filter":"Foto"}],"lastUpdatedDate":"2020","keyVisualLink":"https://mb-wf1.memobase.unibas.ch/sites/default/files/styles/teaser/public/2021-02/vitrine1_hero.jpg?itok=S-b5nq1p","numberOfRecordSets":1,"numberOfDocuments":123,"id":"completeExampleTest"}
\ No newline at end of file
{
"@graph": [
{
"@id": "_:b0",
"@type": "https://www.ica.org/standards/RiC/ontology#Identifier",
"identifier": "sts",
"type": "main"
},
{
"@id": "_:b1",
"@type": "https://www.ica.org/standards/RiC/ontology#Place",
"P131": [
"_:b2",
"_:b3"
],
"P17": "http://www.wikidata.org/entity/Q39",
"P281": "8200",
"P625": "8.6337185, 47.6973",
"P6375": "Fronwagplatz 24",
"P669": "Fronwagplatz",
"P670": "24"
},
{
"@id": "_:b2",
"@type": "https://www.ica.org/standards/RiC/ontology#Place",
"sameAs": "http://www.wikidata.org/entity/Q12697",
"name": [
{
"@language": "de",
"@value": "Schaffhausen"
},
{
"@language": "fr",
"@value": "Schaffhouse"
},
{
"@language": "it",
"@value": "Schaffhouse"
}
],
"type": "canton"
},
{
"@id": "_:b3",
"@type": "https://www.ica.org/standards/RiC/ontology#Place",
"sameAs": "http://www.wikidata.org/entity/Q9009",
"name": [
{
"@language": "de",
"@value": "Schaffhausen"
},
{
"@language": "fr",
"@value": "Schaffhouse"
},
{
"@language": "it",
"@value": "Sciaffusa"
}
],
"type": "municipality"
},
{