Commit 9146aa8b authored by Jonas Waeber
Browse files

Finish implementation for institution search doc.

parent 1b42a14b
......@@ -50,6 +50,8 @@ dependencies {
// JSON Parser
implementation 'com.beust:klaxon:5.2'
// CSV Reader
implementation("com.github.doyaaaaaken:kotlin-csv-jvm:0.7.3")
implementation 'org.jetbrains.kotlin:kotlin-stdlib-jdk8'
implementation "org.jetbrains.kotlin:kotlin-script-runtime:1.3.71"
......
......@@ -25,4 +25,11 @@ spec:
name: "{{ .Values.kafkaConfigs }}"
- configMapRef:
name: "{{ .Values.deploymentName}}-app-config"
volumeMounts:
- name: instituion-type-labels
mountPath: "/configs/institution_types/"
volumes:
- name: instituion-type-labels
configMap:
name: "{{ .Values.instutionTypeLabels }}"
restartPolicy: Always
......@@ -9,4 +9,6 @@ outputTopic: search-doc-output-documents
inputTopic: search-doc-input-documents
reportingTopic: postprocessing-reporting
instutionTypeLabels: institution-type-labels
mediaServerUrl: https://media.memobase.k8s.unibas.ch/memo/
\ No newline at end of file
......@@ -18,24 +18,84 @@
package org.memobase
import com.beust.klaxon.JsonArray
import com.beust.klaxon.JsonObject
import org.apache.logging.log4j.LogManager
import org.memobase.builders.*
import org.memobase.helpers.*
import org.memobase.model.EnrichedDigitalMetadata
import org.memobase.model.InstitutionSearchDoc
import org.memobase.model.Schema
import org.memobase.model.SearchDoc
import org.memobase.rdf.NS
import org.memobase.model.*
/**
 * Builds the [InstitutionSearchDoc] for a single institution message.
 *
 * @param path location of the institution type label file; it is handed to [InstitutionTypeMapper].
 */
class InstitutionSearchDocBuilder(path: String) {
    private val log = LogManager.getLogger("InstitutionSearchDocBuilder")
    private val institutionTypeMapper = InstitutionTypeMapper(path)

    /**
     * Transforms the entities of one message into an institution search document.
     *
     * @param key message key, only used to make log and error messages traceable.
     * @param input entities of the message; the institution itself is expected under the key "institution".
     * @return the populated [InstitutionSearchDoc].
     * @throws InvalidInputException if [input] contains no "institution" entry.
     */
    fun transform(key: String, input: Map<String, JsonObject>): Schema {
        val institution = input["institution"]
            ?: throw InvalidInputException("No institution entity found in message $key.")
        val identifiers = Filter.entitiesByProperty(KEYS.identifiedBy, institution, input)
        // NOTE(review): the resolved locations are never read; cantons are collected from all
        // entities of the message instead. The lookup is kept in case it has side effects - confirm.
        val locations = Filter.entitiesByProperty(KEYS.hasLocation, institution, input)

        return InstitutionSearchDoc(
            institutionId = Extract.extractIdValue(identifiers, KEYS.IdentifierType.main) ?: "NoIdentifierFound",
            published = isPublished(key, institution),
            type = institutionTypes(key, institution),
            name = mergeOrNull(Extract.languageContainer("institution", institution[KEYS.name]))
                ?: LanguageContainer.placeholder("NoNameFound"),
            documentType = listOf(LanguageContainer.placeholder("PLACEHOLDER")),
            keyVisualLink = "placeholderlink",
            canton = cantonNames(input),
            numberOfRecordSets = 0,
            numberOfDocuments = 0
        )
    }

    /** Reads the isPublished flag; anything that is not a string is logged and treated as false. */
    private fun isPublished(key: String, institution: JsonObject): Boolean =
        when (val flag = institution[KEYS.isPublished]) {
            is String -> flag.toBoolean()
            else -> {
                log.error("Found no isPublished property on institution $key. Set to false.")
                false
            }
        }

    /** Maps the institution type value(s) to their labels; the value may be a single string or an array. */
    private fun institutionTypes(key: String, institution: JsonObject): List<LanguageContainer> =
        when (val instanceOf = institution[KEYS.wikidataInstance]) {
            is String -> listOf(institutionTypeMapper.getValue(instanceOf))
            is JsonArray<*> -> {
                // A blind cast of every element would throw ClassCastException on a non-string entry,
                // which the topology does not catch. Such entries are dropped and reported instead.
                val uris = instanceOf.filterIsInstance<String>()
                if (uris.size != instanceOf.size) {
                    log.warn("Ignored ${instanceOf.size - uris.size} non-string institution type value(s) on institution $key.")
                }
                uris.map { institutionTypeMapper.getValue(it) }
            }
            else -> {
                log.error("Found no institution types on institution $key")
                emptyList()
            }
        }

    /**
     * Collects one merged name container per canton entity in the message.
     * Cantons without any name are skipped; if nothing remains a single placeholder is returned.
     */
    private fun cantonNames(input: Map<String, JsonObject>): List<LanguageContainer> =
        input.values
            .filter { it[KEYS.ricoType] == KEYS.LocationType.canton }
            .mapNotNull { canton -> mergeOrNull(Extract.languageContainer("canton", canton[KEYS.name])) }
            .ifEmpty { listOf(LanguageContainer.placeholder("NoCantonNameFound")) }

    /** Merges all containers into one, or returns null for an empty list (reduce would throw on it). */
    private fun mergeOrNull(containers: List<LanguageContainer>): LanguageContainer? =
        if (containers.isEmpty()) null else containers.reduce { merged, next -> merged.merge(next) }
}
......@@ -18,7 +18,6 @@
package org.memobase
import com.beust.klaxon.JsonObject
import com.fasterxml.jackson.databind.ObjectMapper
import com.fasterxml.jackson.module.kotlin.registerKotlinModule
import java.io.StringWriter
......@@ -30,10 +29,9 @@ import org.apache.kafka.streams.kstream.Predicate
import org.apache.logging.log4j.LogManager
import org.memobase.helpers.Default
import org.memobase.helpers.JSON
import org.memobase.model.InstitutionSearchDoc
import org.memobase.helpers.KEYS
import org.memobase.model.Report
import org.memobase.model.Schema
import org.memobase.model.SearchDoc
import org.memobase.settings.SettingsLoader
class KafkaTopology(private val settings: SettingsLoader) {
......@@ -41,8 +39,8 @@ class KafkaTopology(private val settings: SettingsLoader) {
private val reportTopic = settings.processReportTopic
private val searchDocTransform = SearchDocTransform(settings.appSettings.getProperty(KEYS.mediaUrlPropName))
private val institutionSearchDoc = InstitutionSearchDocBuilder()
private val searchDocTransform = SearchDocTransform(settings.appSettings.getProperty(KEYS.SettingsProps.mediaUrl))
private val institutionSearchDoc = InstitutionSearchDocBuilder(settings.appSettings.getProperty(KEYS.SettingsProps.institutionTypeLabelsPath))
fun build(): Topology {
val builder = StreamsBuilder()
......@@ -71,7 +69,7 @@ class KafkaTopology(private val settings: SettingsLoader) {
val institutionStream = branchedStream[1]
.mapValues { readOnlyKey, value ->
try {
Pair(institutionSearchDoc.transform(value), Report(readOnlyKey, "SUCCESS", "Transformed message into search doc."))
Pair(institutionSearchDoc.transform(readOnlyKey, value), Report(readOnlyKey, "SUCCESS", "Transformed message into search doc."))
} catch (ex: InvalidInputException) {
Pair(Default.institutionSearchDoc, Report(readOnlyKey, "FAILURE", ex.localizedMessage))
}
......@@ -86,12 +84,12 @@ class KafkaTopology(private val settings: SettingsLoader) {
private fun outputStreams(stream: KStream<String, Pair<Schema, Report>>) {
stream
.map { _, value -> KeyValue(value.second.id, value.second.toJson()) }
.mapValues { value -> value.second.toJson() }
.to(reportTopic)
stream
.filterNot { _, value -> value.second.status == "FAILURE" }
.map { _, value -> KeyValue(value.first.id, value.first) }
.mapValues { value -> value.first }
.mapValues { value ->
val writer = StringWriter()
ObjectMapper().registerKotlinModule().writeValue(writer, value)
......
......@@ -21,6 +21,7 @@ package org.memobase
import kotlin.system.exitProcess
import org.apache.kafka.streams.KafkaStreams
import org.apache.logging.log4j.LogManager
import org.memobase.helpers.KEYS
import org.memobase.settings.SettingsLoader
class Service(file: String = "app.yml") {
......@@ -28,7 +29,8 @@ class Service(file: String = "app.yml") {
val settings = SettingsLoader(
listOf(
KEYS.mediaUrlPropName
KEYS.SettingsProps.institutionTypeLabelsPath,
KEYS.SettingsProps.mediaUrl
),
file,
useStreamsConfig = true
......
......@@ -20,7 +20,7 @@ package org.memobase.builders
import com.beust.klaxon.JsonObject
import org.apache.logging.log4j.LogManager
import org.memobase.KEYS
import org.memobase.helpers.KEYS
import org.memobase.helpers.Extract
import org.memobase.helpers.FacetBuildHelpers
import org.memobase.model.AgentWithRelationContainer
......
......@@ -2,7 +2,7 @@ package org.memobase.builders
import com.beust.klaxon.JsonObject
import org.apache.logging.log4j.LogManager
import org.memobase.KEYS
import org.memobase.helpers.KEYS
import org.memobase.helpers.DateFacetBuildHelpers
import org.memobase.model.DateContainer
import org.memobase.rdf.NS
......
......@@ -19,7 +19,7 @@
package org.memobase.builders
import com.beust.klaxon.JsonObject
import org.memobase.KEYS
import org.memobase.helpers.KEYS
import org.memobase.helpers.Extract
import org.memobase.model.FacettedContainer
import org.memobase.rdf.NS
......
......@@ -20,7 +20,7 @@ package org.memobase.builders
import com.beust.klaxon.JsonObject
import org.apache.logging.log4j.LogManager
import org.memobase.KEYS
import org.memobase.helpers.KEYS
import org.memobase.helpers.FacetBuildHelpers
import org.memobase.rdf.NS
......
......@@ -20,7 +20,7 @@ package org.memobase.builders
import com.beust.klaxon.JsonObject
import org.apache.logging.log4j.LogManager
import org.memobase.KEYS
import org.memobase.helpers.KEYS
import org.memobase.helpers.FacetBuildHelpers
import org.memobase.rdf.NS
......
......@@ -19,7 +19,7 @@
package org.memobase.builders
import com.beust.klaxon.JsonObject
import org.memobase.KEYS
import org.memobase.helpers.KEYS
import org.memobase.helpers.Extract
import org.memobase.model.SuggestContainer
import org.memobase.rdf.NS
......
......@@ -4,12 +4,11 @@ import org.memobase.model.*
object Default {
val institutionSearchDoc = InstitutionSearchDoc(
"UnknownId",
false,
LanguageContainer(emptyList(), emptyList(), emptyList(), emptyList()),
LanguageContainer(emptyList(), emptyList(), emptyList(), emptyList()),
LanguageContainer.EMPTY,
listOf(LanguageContainer.EMPTY),
emptyList(),
emptyList(),
"",
......
......@@ -20,7 +20,6 @@ package org.memobase.helpers
import com.beust.klaxon.JsonObject
import org.apache.logging.log4j.LogManager
import org.memobase.KEYS
/**
* Helper functions to build hierarchical facet values for places and persons.
......
/*
* search-doc-service
* Copyright (C) 2020 Memoriav
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package org.memobase.helpers
import org.memobase.model.LanguageContainer
/**
 * Resolves institution type URIs to their labels.
 *
 * The labels are read once, on construction, from the label file at [path]
 * via [LoadFile.readLabelFile].
 */
class InstitutionTypeMapper(path: String) {
    private val labels = LoadFile.readLabelFile(path)

    /**
     * Looks up the label for [uri] by the text after its last "/".
     * (presumably a Wikidata Q-identifier - confirm against the label file)
     *
     * @return the matching labels, or [LanguageContainer.DEFAULT] when there is no entry.
     */
    fun getValue(uri: String): LanguageContainer =
        labels.getOrElse(uri.substringAfterLast("/")) { LanguageContainer.DEFAULT }
}
......@@ -16,10 +16,14 @@
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package org.memobase
package org.memobase.helpers
object KEYS {
const val mediaUrlPropName = "media.url"
/**
 * Names of the application settings properties. They are listed as settings in Service and
 * looked up with settings.appSettings.getProperty(...) in KafkaTopology.
 */
object SettingsProps {
// Passed to SearchDocTransform by KafkaTopology.
const val mediaUrl = "media.url"
// Path handed to InstitutionSearchDocBuilder, which loads the institution type labels from it.
const val institutionTypeLabelsPath = "institutionTypeLabelsPath"
}
const val entityId = "@id"
const val atType = "@type"
......@@ -34,6 +38,7 @@ object KEYS {
const val agentIsTargetOfCreationRelation = "agentIsTargetOfCreationRelation"
const val hasSubject = "hasSubject"
const val hasLocation = "hasLocation"
const val placeOfCapture = "P60556"
const val spatial = "spatial"
const val producer = "P60441"
......@@ -45,6 +50,8 @@ object KEYS {
const val contributor = "contributor"
const val creator = "creator"
const val identifiedBy = "identifiedBy"
const val Person = "Person"
const val CorporateBody = "CorporateBody"
const val Agent = "Agent"
......@@ -64,6 +71,14 @@ object KEYS {
const val Concept = "Concept"
const val wikidataInstance = "P31"
// Fallback labels per language, used by LanguageContainer.DEFAULT when no label is known.
const val missingLabelDe = "FEHLENDES LABEL"
const val missingLabelFr = "L'ÉTIQUETTE MANQUANTE"
// "Etichetta" is the Italian word for a label; "galateo" means etiquette (good manners).
const val missingLabelIt = "ETICHETTA MANCANTE"
const val missingLabelEn = "MISSING LABEL"
object TitleTypes {
const val main = "main"
const val series = "series"
......@@ -75,4 +90,11 @@ object KEYS {
const val original = "original"
const val callNumber = "callNumber"
}
/**
 * Type values of location entities.
 * InstitutionSearchDocBuilder compares an entity's KEYS.ricoType value against [canton];
 * the other values are not used in the code visible here - presumably matched the same way, confirm.
 */
object LocationType {
const val canton = "canton"
const val municipality = "municipality"
const val memobaseInstitution = "memobaseInstitution"
const val memobaseProject = "memobaseProject"
}
}
package org.memobase.helpers
import com.github.doyaaaaaken.kotlincsv.dsl.csvReader
import org.memobase.model.LanguageContainer
import java.io.File
/**
 * Loads label files from disk.
 */
object LoadFile {
    // Key column plus one label column each for de, fr and it.
    private const val LABEL_COLUMN_COUNT = 4

    private val csv = csvReader()

    /**
     * Reads a CSV label file into a map keyed by the first column.
     *
     * The first row is treated as a header and skipped. Columns 1 to 3 of every following row
     * become the de, fr and it labels of the [LanguageContainer]; the unknown-language slot stays empty.
     * When a key occurs more than once the last row wins.
     *
     * @param path location of the CSV file.
     * @return the labels by key; empty when the file has no data rows (including a completely empty file).
     * @throws IllegalArgumentException if a data row has fewer than four columns.
     */
    fun readLabelFile(path: String): Map<String, LanguageContainer> =
        csv.readAll(File(path))
            // drop(1) tolerates an empty file, where listIterator(1) would throw IndexOutOfBoundsException.
            .drop(1)
            .associate { row ->
                require(row.size >= LABEL_COLUMN_COUNT) {
                    "Label file $path: expected at least $LABEL_COLUMN_COUNT columns but found ${row.size} in row $row."
                }
                row[0] to LanguageContainer(listOf(row[1]), listOf(row[2]), listOf(row[3]), emptyList())
            }
}
\ No newline at end of file
......@@ -2,7 +2,6 @@ package org.memobase.helpers
import com.beust.klaxon.JsonObject
import org.apache.logging.log4j.LogManager
import org.memobase.KEYS
import org.memobase.model.FacettedContainer
import org.memobase.model.LanguageContainer
import org.memobase.rdf.NS
......
......@@ -10,7 +10,7 @@ data class InstitutionSearchDoc(
val name: LanguageContainer,
// Facettes
val canton: LanguageContainer,
val canton: List<LanguageContainer>,
val type: List<LanguageContainer>,
val documentType: List<LanguageContainer>,
......
......@@ -19,6 +19,7 @@
package org.memobase.model
import com.fasterxml.jackson.annotation.JsonInclude
import org.memobase.helpers.KEYS
@JsonInclude(JsonInclude.Include.NON_NULL)
data class LanguageContainer(
......@@ -27,6 +28,25 @@ data class LanguageContainer(
val it: List<String>,
val un: List<String> // if the language is not known
) {
companion object {
    /** A container without a value in any language slot. */
    val EMPTY = LanguageContainer(emptyList(), emptyList(), emptyList(), emptyList())

    /**
     * A container holding the "missing label" text for each language.
     * The English text is placed in the last slot, the one for an unknown language.
     */
    val DEFAULT = LanguageContainer(
        listOf(KEYS.missingLabelDe),
        listOf(KEYS.missingLabelFr),
        listOf(KEYS.missingLabelIt),
        listOf(KEYS.missingLabelEn)
    )

    /** Creates a container carrying the same [placeholder] text in every slot. */
    fun placeholder(placeholder: String): LanguageContainer =
        LanguageContainer(
            listOf(placeholder),
            listOf(placeholder),
            listOf(placeholder),
            listOf(placeholder)
        )
}
fun toList(): List<String> {
return de + fr + it + un
}
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment