Commit 1861f94c authored by Jonas Waeber's avatar Jonas Waeber
Browse files

Add institution name retrieval.

Add date range for timeperiod.
Add metadata language values
parent 0a712448
Pipeline #20596 passed with stages
in 5 minutes and 21 seconds
...@@ -34,7 +34,7 @@ ext { ...@@ -34,7 +34,7 @@ ext {
dependencies { dependencies {
compile group: 'org.elasticsearch.client', name: 'elasticsearch-rest-high-level-client', version: '7.6.1' compile group: 'org.elasticsearch.client', name: 'elasticsearch-rest-high-level-client', version: '7.6.1'
implementation 'org.memobase:memobase-service-utilities:2.0.0' implementation 'org.memobase:memobase-service-utilities:2.0.5'
// Logging Framework // Logging Framework
implementation "org.apache.logging.log4j:log4j-api:${log4jV}" implementation "org.apache.logging.log4j:log4j-api:${log4jV}"
......
...@@ -5,7 +5,8 @@ metadata: ...@@ -5,7 +5,8 @@ metadata:
namespace: memobase namespace: memobase
data: data:
APPLICATION_ID: "{{ .Values.deploymentName }}-app" APPLICATION_ID: "{{ .Values.deploymentName }}-app"
ELASTIC_INDEX: "{{ .Values.elasticIndex }}" DOCUMENTS_INDEX: "{{ .Values.documentsIndex }}"
INSTITUTION_INDEX: "{{ .Values.institutionIndex }}"
MEDIA_SERVER_URL: "{{ .Values.mediaServerUrl }}" MEDIA_SERVER_URL: "{{ .Values.mediaServerUrl }}"
TOPIC_IN: "{{ .Values.inputTopic }}" TOPIC_IN: "{{ .Values.inputTopic }}"
TOPIC_OUT: "{{ .Values.outputTopic }}" TOPIC_OUT: "{{ .Values.outputTopic }}"
......
...@@ -7,7 +7,8 @@ deploymentName: search-doc-service ...@@ -7,7 +7,8 @@ deploymentName: search-doc-service
kafkaConfigs: prod-kafka-bootstrap-servers kafkaConfigs: prod-kafka-bootstrap-servers
elasticConfigs: prod-elastic-configs elasticConfigs: prod-elastic-configs
elasticIndex: documents-v17 documentsIndex: documents-v17
institutionIndex: institutions-v1
outputTopic: search-doc-output-documents outputTopic: search-doc-output-documents
inputTopic: search-doc-input-documents inputTopic: search-doc-input-documents
reportingTopic: postprocessing-reporting reportingTopic: postprocessing-reporting
......
...@@ -35,7 +35,8 @@ class App { ...@@ -35,7 +35,8 @@ class App {
SettingsProps.mediaUrl, SettingsProps.mediaUrl,
SettingsProps.elasticHost, SettingsProps.elasticHost,
SettingsProps.elasticPort, SettingsProps.elasticPort,
SettingsProps.elasticIndex SettingsProps.documentsIndex,
SettingsProps.institutionIndex
), ),
file, file,
useStreamsConfig = true useStreamsConfig = true
......
...@@ -19,16 +19,17 @@ ...@@ -19,16 +19,17 @@
package org.memobase package org.memobase
import com.beust.klaxon.JsonObject import com.beust.klaxon.JsonObject
import java.lang.NumberFormatException
import org.apache.logging.log4j.LogManager import org.apache.logging.log4j.LogManager
import org.memobase.helpers.Date import org.memobase.helpers.Date
import org.memobase.helpers.ElasticSearchWrapper import org.memobase.helpers.ElasticSearchWrapper
import org.memobase.helpers.Extract import org.memobase.helpers.Extract
import org.memobase.helpers.KEYS import org.memobase.helpers.KEYS
import org.memobase.model.FacetContainer import org.memobase.model.FacetContainer
import org.memobase.model.IntegerRange
import org.memobase.model.LanguageContainer import org.memobase.model.LanguageContainer
import org.memobase.model.RecordSetSearchDoc import org.memobase.model.RecordSetSearchDoc
import org.memobase.model.Schema import org.memobase.model.Schema
import org.memobase.model.IntegerRange
class RecordSetSearchDocBuilder(private val elasticSearchWrapper: ElasticSearchWrapper) { class RecordSetSearchDocBuilder(private val elasticSearchWrapper: ElasticSearchWrapper) {
...@@ -36,47 +37,77 @@ class RecordSetSearchDocBuilder(private val elasticSearchWrapper: ElasticSearchW ...@@ -36,47 +37,77 @@ class RecordSetSearchDocBuilder(private val elasticSearchWrapper: ElasticSearchW
fun transform(key: String, input: Map<String, JsonObject>): Schema { fun transform(key: String, input: Map<String, JsonObject>): Schema {
val recordSet = val recordSet =
input["recordSet"] ?: throw InvalidInputException("No recordSet entity found in message $key.") input["recordSet"] ?: throw InvalidInputException("No recordSet entity found in message $key.")
val identifiers = mutableListOf<JsonObject>() val metadataLanguages = mutableListOf<JsonObject>()
input.values.forEach { input.values.forEach {
when { when {
it[KEYS.ricoType] == KEYS.IdentifierType.main -> { it[KEYS.ricoType] == KEYS.LanguageType.metadata -> {
identifiers.add(it) metadataLanguages.add(it)
} }
} }
} }
val name = extractLanguageContainer(recordSet[KEYS.title], "NoNameFound") val name = extractLanguageContainer(recordSet[KEYS.title], "NoNameFound")
val description = extractLanguageContainer(recordSet[KEYS.descriptiveNote], "NoDescriptionFound") val description = extractLanguageContainer(recordSet[KEYS.descriptiveNote], "NoDescriptionFound")
val id = Extract.extractIdValue(identifiers, KEYS.IdentifierType.main) ?: "NoIdentifierFound" val dates = Extract.identifiers(recordSet[KEYS.isAssociatedWithDate]).mapNotNull {
val institution = recordSet[KEYS.heldBy] as String? input[it]
if (institution != null) { }.map {
// TODO: it[KEYS.normalizedDateValue] as String
}
val date = if (dates.isNotEmpty()) {
try {
val splitDate = dates[0].split("/")
if (splitDate.size == 2) {
IntegerRange(splitDate[0].toInt(), splitDate[1].toInt())
}
else
IntegerRange(splitDate[0].toInt(), splitDate[0].toInt())
} catch (ex: NumberFormatException) {
IntegerRange(3000, 3001)
}
} else {
IntegerRange(3000, 3001)
} }
val uri = recordSet[KEYS.entityId] as String
val id = uri.substringAfterLast("/")
val institution = recordSet[KEYS.heldBy] as String
val institutionId = institution.substringAfterLast("/")
return RecordSetSearchDoc( return RecordSetSearchDoc(
recordSetId = id, recordSetId = id,
isPublished = recordSet[KEYS.isPublished].let { isPublished = recordSet[KEYS.isPublished].let {
when (it) { when (it) {
is Boolean -> it is Boolean -> it
is String -> it.toBoolean() is String -> it.toBoolean()
else -> { else -> {
log.error("Found no isPublished property on record set $key. Set to false.") log.error("Found no isPublished property on record set $key. Set to false.")
false false
}
} }
}, }
scopeAndContent = description, },
periodOfTimeAsYear = IntegerRange(1920, 2020), scopeAndContent = description,
institution = FacetContainer(LanguageContainer.placeholder("NoNameInstitution"), filter = institution, facet = emptyList()), periodOfTimeAsYear = date,
supportedByMemoriav = recordSet[KEYS.sponsoredBy] != null, institution = elasticSearchWrapper.getInstitutionName(institutionId),
supportedByMemoriav = recordSet[KEYS.sponsoredBy] != null,
name = name, name = name,
documentType = elasticSearchWrapper.getDocumentTypesFromRecords(id, KEYS.QueryFields.recordSetFacet), documentType = elasticSearchWrapper.getDocumentTypesFromRecords(id, KEYS.QueryFields.recordSetFacet),
keyVisualLink = recordSet[KEYS.wikidataImage].let { if (it != null) it as String else "NoKeyVisualLinkDefined" }, keyVisualLink = recordSet[KEYS.wikidataImage].let { if (it != null) it as String else "NoKeyVisualLinkDefined" },
numberOfDocuments = elasticSearchWrapper.countNumberOfDocuments(id), numberOfDocuments = elasticSearchWrapper.countNumberOfDocuments(id),
lastUpdatedDate = Date.now, lastUpdatedDate = Date.now,
languageOfMetadata = FacetContainer(LanguageContainer.placeholder("Deutsch"), filter = null, facet = emptyList()) languageOfMetadata = metadataLanguages.map {
FacetContainer(
extractLanguageContainer(it[KEYS.name], "NoMetadataLanguageSet"),
it[KEYS.sameAs].let { wikidataUri ->
when (wikidataUri) {
is String -> wikidataUri.substringAfterLast("/")
else -> null
}
},
emptyList()
)
}
) )
} }
......
...@@ -53,7 +53,8 @@ class Service(settings: SettingsLoader) { ...@@ -53,7 +53,8 @@ class Service(settings: SettingsLoader) {
private val host = appSettings.getProperty(SettingsProps.elasticHost) private val host = appSettings.getProperty(SettingsProps.elasticHost)
private val port = appSettings.getProperty(SettingsProps.elasticPort).toInt() private val port = appSettings.getProperty(SettingsProps.elasticPort).toInt()
private val documentsIndex = appSettings.getProperty(SettingsProps.elasticIndex) private val documentsIndex = appSettings.getProperty(SettingsProps.documentsIndex)
private val institutionIndex = appSettings.getProperty(SettingsProps.institutionIndex)
private val client: RestHighLevelClient = connect() private val client: RestHighLevelClient = connect()
private fun connect(): RestHighLevelClient { private fun connect(): RestHighLevelClient {
...@@ -65,12 +66,14 @@ class Service(settings: SettingsLoader) { ...@@ -65,12 +66,14 @@ class Service(settings: SettingsLoader) {
) )
val indexExists = c.indices().exists(GetIndexRequest(documentsIndex), RequestOptions.DEFAULT) val indexExists = c.indices().exists(GetIndexRequest(documentsIndex), RequestOptions.DEFAULT)
val aliasExists = c.indices().existsAlias(GetAliasesRequest(documentsIndex), RequestOptions.DEFAULT) val aliasExists = c.indices().existsAlias(GetAliasesRequest(documentsIndex), RequestOptions.DEFAULT)
val institutionIndexExists = c.indices().exists(GetIndexRequest(institutionIndex), RequestOptions.DEFAULT)
val institutionIndexAliasExists = c.indices().existsAlias(GetAliasesRequest(institutionIndex), RequestOptions.DEFAULT)
if (!indexExists && !aliasExists) { if (!indexExists && !aliasExists && !institutionIndexExists && !institutionIndexAliasExists) {
log.error("Could not find the index or alias defined in the configuration: $documentsIndex.") log.error("Could not find the indices or aliases defined in the configuration: $documentsIndex, $institutionIndex.")
exitProcess(1) exitProcess(1)
} else { } else {
log.info("Successfully connected to index $documentsIndex. Ready to query.") log.info("Successfully connected to indices $documentsIndex and $institutionIndex. Ready to query.")
c c
} }
} catch (ex: ElasticsearchException) { } catch (ex: ElasticsearchException) {
......
...@@ -22,6 +22,7 @@ import com.beust.klaxon.KlaxonException ...@@ -22,6 +22,7 @@ import com.beust.klaxon.KlaxonException
import java.util.Properties import java.util.Properties
import org.apache.logging.log4j.LogManager import org.apache.logging.log4j.LogManager
import org.elasticsearch.ElasticsearchException import org.elasticsearch.ElasticsearchException
import org.elasticsearch.action.get.GetRequest
import org.elasticsearch.action.search.ClearScrollRequest import org.elasticsearch.action.search.ClearScrollRequest
import org.elasticsearch.action.search.SearchRequest import org.elasticsearch.action.search.SearchRequest
import org.elasticsearch.action.search.SearchScrollRequest import org.elasticsearch.action.search.SearchScrollRequest
...@@ -33,6 +34,8 @@ import org.elasticsearch.index.query.QueryBuilders.termQuery ...@@ -33,6 +34,8 @@ import org.elasticsearch.index.query.QueryBuilders.termQuery
import org.elasticsearch.search.Scroll import org.elasticsearch.search.Scroll
import org.elasticsearch.search.builder.SearchSourceBuilder import org.elasticsearch.search.builder.SearchSourceBuilder
import org.memobase.model.FacetContainer import org.memobase.model.FacetContainer
import org.memobase.model.LanguageContainer
import org.memobase.model.LanguageContainer.Companion
/** /**
...@@ -45,7 +48,8 @@ class ElasticSearchWrapper( ...@@ -45,7 +48,8 @@ class ElasticSearchWrapper(
private val translationMappers: TranslationMappers private val translationMappers: TranslationMappers
) { ) {
private val log = LogManager.getLogger("ElasticSearchWrapper") private val log = LogManager.getLogger("ElasticSearchWrapper")
private val documentsIndex = settings.getProperty(KEYS.SettingsProps.elasticIndex) private val documentsIndex = settings.getProperty(KEYS.SettingsProps.documentsIndex)
private val institutionIndex = settings.getProperty(KEYS.SettingsProps.institutionIndex)
private val klaxon = Klaxon() private val klaxon = Klaxon()
...@@ -143,4 +147,38 @@ class ElasticSearchWrapper( ...@@ -143,4 +147,38 @@ class ElasticSearchWrapper(
emptyList() emptyList()
} }
} }
/**
 * Fetches the institution document with the given [identifier] from the institution index
 * and wraps its `name` field in a [FacetContainer].
 *
 * The [identifier] is always used as the filter value and the facet list is always empty.
 * When the document does not exist, has no usable `name` field, or the request fails with an
 * [ElasticsearchException], the container carries [LanguageContainer.EMPTY] as its name.
 * Exceptions other than [ElasticsearchException] are not handled here and propagate to the caller.
 *
 * @param identifier document id of the institution in the institution index.
 * @return a facet container with the institution name, or with an empty name as fallback.
 */
fun getInstitutionName(identifier: String): FacetContainer {
    // Single fallback value shared by every "no name available" outcome.
    val withoutName = FacetContainer(
        LanguageContainer.EMPTY,
        identifier,
        emptyList()
    )
    return try {
        log.info("Attempting to retrieve institution record.")
        val request = GetRequest(institutionIndex, identifier)
        val response = client.get(request, RequestOptions.DEFAULT)
        // Null-safe lookup: a document without source or without a "name" field must not throw.
        val name = if (response.isExists) response.sourceAsMap?.get("name") else null
        when {
            !response.isExists -> {
                log.error("Could not find institution $identifier in index $institutionIndex.")
                withoutName
            }
            name == null -> {
                log.error("Institution $identifier in index $institutionIndex has no name field.")
                withoutName
            }
            else -> {
                log.info("Successfully retrieved institution name.")
                FacetContainer(
                    LanguageContainer.fromMap(name),
                    identifier,
                    emptyList()
                )
            }
        }
    } catch (ex: ElasticsearchException) {
        log.error(ex.detailedMessage)
        withoutName
    }
}
} }
\ No newline at end of file
...@@ -19,6 +19,10 @@ ...@@ -19,6 +19,10 @@
package org.memobase.helpers package org.memobase.helpers
object KEYS { object KEYS {
/**
 * Type values used to classify language entities.
 *
 * [metadata] is the value an entity's type property is compared against when
 * collecting the languages of the metadata for a record set.
 */
object LanguageType {
const val metadata = "metadata"
}
object SettingsProps { object SettingsProps {
const val accessTermLabelsPath = "accessTermLabelsPath" const val accessTermLabelsPath = "accessTermLabelsPath"
const val reuseStatementLabelsPath = "reuseStatementLabelsPath" const val reuseStatementLabelsPath = "reuseStatementLabelsPath"
...@@ -27,8 +31,13 @@ object KEYS { ...@@ -27,8 +31,13 @@ object KEYS {
const val institutionTypeLabelsPath = "institutionTypeLabelsPath" const val institutionTypeLabelsPath = "institutionTypeLabelsPath"
const val elasticHost = "elastic.host" const val elasticHost = "elastic.host"
const val elasticPort = "elastic.port" const val elasticPort = "elastic.port"
const val elasticIndex = "elastic.index" const val documentsIndex = "elastic.documentsIndex"
const val institutionIndex = "elastic.institutionIndex"
} }
const val isAssociatedWithDate = "isAssociatedWithDate"
const val normalizedDateValue = "normalizedDateValue"
const val sameAs = "sameAs"
const val entityId = "@id" const val entityId = "@id"
const val atType = "@type" const val atType = "@type"
......
...@@ -45,6 +45,20 @@ data class LanguageContainer( ...@@ -45,6 +45,20 @@ data class LanguageContainer(
fun placeholder(placeholder: String): LanguageContainer { fun placeholder(placeholder: String): LanguageContainer {
return LanguageContainer(listOf(placeholder), listOf(placeholder), listOf(placeholder), listOf(placeholder)) return LanguageContainer(listOf(placeholder), listOf(placeholder), listOf(placeholder), listOf(placeholder))
} }
/**
 * Builds a [LanguageContainer] from an untyped value that is expected to be a map with the
 * keys `de`, `fr`, `it` and `un`, each holding a list of strings.
 *
 * Any value that is not a map yields [EMPTY]. A key that is missing or does not hold a list
 * yields an empty list for that language instead of a failing cast, and list elements that
 * are not strings are dropped.
 *
 * @param map the raw value to convert, e.g. a field taken from a document source map.
 * @return the container with the labels found for each language key.
 */
fun fromMap(map: Any): LanguageContainer {
    if (map !is Map<*, *>) return EMPTY

    // Safe cast + element filter: absent keys and malformed values degrade to an empty list.
    fun labels(key: String): List<String> =
        (map[key] as? List<*>)?.filterIsInstance<String>().orEmpty()

    return LanguageContainer(
        labels("de"),
        labels("fr"),
        labels("it"),
        labels("un")
    )
}
} }
fun toList(): List<String> { fun toList(): List<String> {
......
...@@ -31,7 +31,7 @@ data class RecordSetSearchDoc( ...@@ -31,7 +31,7 @@ data class RecordSetSearchDoc(
// Facets // Facets
val documentType: List<FacetContainer>, val documentType: List<FacetContainer>,
val supportedByMemoriav: Boolean, val supportedByMemoriav: Boolean,
val languageOfMetadata: FacetContainer, val languageOfMetadata: List<FacetContainer>,
val institution: FacetContainer, val institution: FacetContainer,
val periodOfTimeAsYear: IntegerRange, val periodOfTimeAsYear: IntegerRange,
...@@ -57,11 +57,11 @@ data class RecordSetSearchDoc( ...@@ -57,11 +57,11 @@ data class RecordSetSearchDoc(
facet = emptyList() facet = emptyList()
)), )),
supportedByMemoriav = true, supportedByMemoriav = true,
languageOfMetadata = FacetContainer( languageOfMetadata = listOf(FacetContainer(
LanguageContainer.placeholder("TEST LANGUAGE"), LanguageContainer.placeholder("TEST LANGUAGE"),
filter = "", filter = "",
facet = emptyList() facet = emptyList()
), )),
institution = FacetContainer( institution = FacetContainer(
LanguageContainer.placeholder("TEST INSTITUTION"), LanguageContainer.placeholder("TEST INSTITUTION"),
filter = "INSTITUTION_IDENTIFIER", filter = "INSTITUTION_IDENTIFIER",
......
...@@ -2,7 +2,8 @@ app: ...@@ -2,7 +2,8 @@ app:
elastic: elastic:
host: ${ELASTIC_HOST:?system} host: ${ELASTIC_HOST:?system}
port: ${ELASTIC_PORT:?system} port: ${ELASTIC_PORT:?system}
index: ${ELASTIC_INDEX:?system} documentsIndex: ${DOCUMENTS_INDEX:?system}
institutionIndex: ${INSTITUTION_INDEX:?system}
media: media:
url: ${MEDIA_SERVER_URL:?system} url: ${MEDIA_SERVER_URL:?system}
institutionTypeLabelsPath: "/configs/institution_types/labels.csv" institutionTypeLabelsPath: "/configs/institution_types/labels.csv"
......
...@@ -18,6 +18,7 @@ import org.junit.jupiter.api.Test ...@@ -18,6 +18,7 @@ import org.junit.jupiter.api.Test
import org.junit.jupiter.api.TestInstance import org.junit.jupiter.api.TestInstance
import org.junit.jupiter.api.assertAll import org.junit.jupiter.api.assertAll
import org.memobase.helpers.ElasticSearchWrapper import org.memobase.helpers.ElasticSearchWrapper
import org.memobase.helpers.KEYS
import org.memobase.model.FacetContainer import org.memobase.model.FacetContainer
import org.memobase.model.LanguageContainer import org.memobase.model.LanguageContainer
...@@ -28,7 +29,7 @@ class TestElasticSearchWrapper { ...@@ -28,7 +29,7 @@ class TestElasticSearchWrapper {
private val host = "localhost" private val host = "localhost"
private val port = 8080 private val port = 8080
private val documentsIndex = "documents-v17" private val documentsIndex = "documents-v17"
private val institutionIndex = "institutions-v1"
private val client: RestHighLevelClient = connect() private val client: RestHighLevelClient = connect()
...@@ -62,6 +63,35 @@ class TestElasticSearchWrapper { ...@@ -62,6 +63,35 @@ class TestElasticSearchWrapper {
} }
} }
// Queries the configured institution index for the id "aag" and compares the returned
// facet container with the expected labels. Disabled by default.
// NOTE(review): presumably needs the same locally reachable Elasticsearch as the other
// disabled tests in this class — confirm before enabling.
@Test
@Disabled
fun `test get institution name`() {
    val settings = Properties().apply {
        setProperty(KEYS.SettingsProps.documentsIndex, documentsIndex)
        setProperty(KEYS.SettingsProps.institutionIndex, institutionIndex)
    }
    val expected = FacetContainer(
        LanguageContainer(
            listOf("Staatsarchiv des Kantons Aargau"),
            listOf("Archives de l’Etat Argovie"),
            listOf("Archivio cantonale del Argovia"),
            emptyList()
        ),
        "aag",
        emptyList()
    )
    val actual = ElasticSearchWrapper(settings, client, TestUtilities.translationMappers)
        .getInstitutionName("aag")
    assertAll("", { assertThat(actual).isEqualTo(expected) })
}
/** /**
* This test can only be run locally. Create a tunnel to * This test can only be run locally. Create a tunnel to
* ssh -L 8080:mb-es1:8080 swissbib@mb-es1.memobase.unibas.ch * ssh -L 8080:mb-es1:8080 swissbib@mb-es1.memobase.unibas.ch
...@@ -71,7 +101,8 @@ class TestElasticSearchWrapper { ...@@ -71,7 +101,8 @@ class TestElasticSearchWrapper {
@Disabled @Disabled
fun `test getDocumentTypesFromRecords`() { fun `test getDocumentTypesFromRecords`() {
val props = Properties() val props = Properties()
props.setProperty("elastic.index", documentsIndex) props.setProperty(KEYS.SettingsProps.documentsIndex, documentsIndex)
props.setProperty(KEYS.SettingsProps.institutionIndex, institutionIndex)
val wrapper = ElasticSearchWrapper(props, client, TestUtilities.translationMappers) val wrapper = ElasticSearchWrapper(props, client, TestUtilities.translationMappers)
...@@ -83,16 +114,18 @@ class TestElasticSearchWrapper { ...@@ -83,16 +114,18 @@ class TestElasticSearchWrapper {
}, },
{ {
assertThat(results[0]) assertThat(results[0])
.isEqualTo(FacetContainer( .isEqualTo(
LanguageContainer( FacetContainer(
listOf("Fotografie"), LanguageContainer(
listOf("Photographie"), listOf("Fotografie"),
listOf("Fotografia"), listOf("Photographie"),
listOf("Fotografia"),
emptyList()
),
"Foto",
emptyList() emptyList()
), )
"Foto", )
emptyList()
))
} }
) )
} }
......
...@@ -30,9 +30,6 @@ class TestInstitutionSearchDoc { ...@@ -30,9 +30,6 @@ class TestInstitutionSearchDoc {
return File("$dataPath/$fileName").readText(Charset.defaultCharset()) return File("$dataPath/$fileName").readText(Charset.defaultCharset())
} }
private val dateRegex = Regex("\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}.\\d{3}")
@Test @Test
@Disabled @Disabled
fun `test institution search doc with production es client`() { fun `test institution search doc with production es client`() {
...@@ -110,8 +107,8 @@ class TestInstitutionSearchDoc { ...@@ -110,8 +107,8 @@ class TestInstitutionSearchDoc {
val reportValue = reader.readValue(report.value(), Report::class.java) val reportValue = reader.readValue(report.value(), Report::class.java)
val key = record.key() val key = record.key()
val value = record.value().replace(dateRegex, "2020") val value = record.value().replace(TestUtilities.dateRegex, "2020")
val resultValue = readFile("output.json").replace(dateRegex, "2020") val resultValue = readFile("output.json").replace(TestUtilities.dateRegex, "2020")
assertAll("", assertAll("",
{ {
......
...@@ -32,8 +32,6 @@ class TestRecordSetSearchDoc { ...@@ -32,8 +32,6 @@ class TestRecordSetSearchDoc {