To mitigate brute-force attacks against GitLab accounts, we are moving all logins to edu-ID. We would like to remind you to link your account with your edu-ID. After November 30, 2021, login will only be possible with edu-ID. Here you can find the instructions for linking your account.

If you don't have a SWITCH edu-ID, you can create one by following this guide here.

kind regards

This Server has been upgraded to GitLab release 14.2.6

Commit cd752478 authored by Jonas Waeber's avatar Jonas Waeber
Browse files

Add record sets search doc generation.

parent ad271354
......@@ -64,6 +64,7 @@ dependencies {
// https://mvnrepository.com/artifact/org.apache.kafka/kafka-streams-test-utils
testCompile group: 'org.apache.kafka', name: 'kafka-streams-test-utils', version: kafkaV
testImplementation "io.mockk:mockk:1.10.0"
}
compileKotlin {
......
......@@ -30,8 +30,13 @@ spec:
volumeMounts:
- name: instituion-type-labels
mountPath: "/configs/institution_types/"
- name: document-type-labels
mountPath: "/configs/document_types/"
volumes:
- name: instituion-type-labels
configMap:
name: "{{ .Values.instutionTypeLabels }}"
- name: document-type-labels
configMap:
name: "{{ .Values.documentTypeLabels }}"
restartPolicy: Always
......@@ -13,5 +13,6 @@ inputTopic: search-doc-input-documents
reportingTopic: postprocessing-reporting
instutionTypeLabels: institution-type-labels
documentTypeLabels: document-type-labels
mediaServerUrl: https://media.memobase.k8s.unibas.ch/memo/
\ No newline at end of file
......@@ -29,10 +29,12 @@ import org.apache.kafka.streams.Topology
import org.apache.kafka.streams.kstream.KStream
import org.apache.kafka.streams.kstream.Predicate
import org.apache.logging.log4j.LogManager
import org.memobase.helpers.Default
import org.memobase.helpers.ElasticSearchWrapper
import org.memobase.helpers.JSON
import org.memobase.helpers.KEYS
import org.memobase.model.DocumentsSearchDoc
import org.memobase.model.InstitutionSearchDoc
import org.memobase.model.RecordSetSearchDoc
import org.memobase.model.Schema
class KafkaTopology(private val settings: SettingsLoader) {
......@@ -45,6 +47,10 @@ class KafkaTopology(private val settings: SettingsLoader) {
private val institutionSearchDoc =
InstitutionSearchDocBuilder(appSettings.getProperty(KEYS.SettingsProps.institutionTypeLabelsPath), appSettings)
private val elasticSearchWrapper = ElasticSearchWrapper(settings.appSettings)
private val recordSetSearchDocBuilder =
RecordSetSearchDocBuilder(elasticSearchWrapper)
private val jsonWriter = ObjectMapper().registerKotlinModule().writer()
fun build(): Topology {
......@@ -83,7 +89,7 @@ class KafkaTopology(private val settings: SettingsLoader) {
Report(readOnlyKey, ReportStatus.success, "Transformed message into search doc.", Service.name)
)
} catch (ex: InvalidInputException) {
Pair(Default.institutionSearchDoc, Report(
Pair(InstitutionSearchDoc.DEFAULT, Report(
readOnlyKey,
ReportStatus.warning,
ex.localizedMessage,
......@@ -92,6 +98,23 @@ class KafkaTopology(private val settings: SettingsLoader) {
}
outputStreams(institutionStream)
val recordSetStream = branchedStream[2]
.mapValues { readOnlyKey, value ->
try {
Pair(
recordSetSearchDocBuilder.transform(readOnlyKey, value),
Report(readOnlyKey, ReportStatus.success, "Transformed message into search doc.", Service.name)
)
} catch (ex: InvalidInputException) {
Pair(RecordSetSearchDoc.DEFAULT, Report(
readOnlyKey,
ReportStatus.warning,
ex.localizedMessage,
Service.name))
}
}
outputStreams(recordSetStream)
branchedStream[3]
.mapValues { readOnlyKey, value ->
Report(
......@@ -111,7 +134,7 @@ class KafkaTopology(private val settings: SettingsLoader) {
.to(reportTopic)
stream
.filterNot { _, value -> value.second.status == "FAILURE" }
.filterNot { _, value -> value.second.status == ReportStatus.fatal }
.mapValues { value -> value.first }
.mapValues { value ->
val out = StringWriter()
......
/*
* search-doc-service
* Copyright (C) 2020 Memoriav
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package org.memobase
import com.beust.klaxon.JsonObject
import org.apache.logging.log4j.LogManager
import org.memobase.helpers.Date
import org.memobase.helpers.ElasticSearchWrapper
import org.memobase.helpers.Extract
import org.memobase.helpers.KEYS
import org.memobase.model.FacetContainer
import org.memobase.model.LanguageContainer
import org.memobase.model.RecordSetSearchDoc
import org.memobase.model.Schema
import org.memobase.model.IntegerRange
/**
 * Builds the search document of a record set from the entities contained in one input message.
 *
 * @param elasticSearchWrapper queried for the document types and the document count of the record set.
 */
class RecordSetSearchDocBuilder(private val elasticSearchWrapper: ElasticSearchWrapper) {
    private val log = LogManager.getLogger("RecordSetSearchDocBuilder")

    /**
     * Transforms the entities of a single message into a [RecordSetSearchDoc].
     *
     * @param key the message key; only used in log and error messages.
     * @param input the entities of the message. Must contain an entry named "recordSet".
     * @return the search document for the record set.
     * @throws InvalidInputException if [input] has no "recordSet" entry.
     */
    fun transform(key: String, input: Map<String, JsonObject>): Schema {
        val recordSet =
            input["recordSet"] ?: throw InvalidInputException("No recordSet entity found in message $key.")

        // Only the entities typed as main identifier are relevant for the id lookup.
        val identifiers = input.values.filterTo(mutableListOf<JsonObject>()) {
            it[KEYS.ricoType] == KEYS.IdentifierType.main
        }

        val name = extractLanguageContainer(recordSet[KEYS.title], "NoNameFound")
        val description = extractLanguageContainer(recordSet[KEYS.descriptiveNote], "NoDescriptionFound")
        val id = Extract.extractIdValue(identifiers, KEYS.IdentifierType.main) ?: "NoIdentifierFound"

        // The JSON values are untyped. A hard cast would throw a ClassCastException, which is not an
        // InvalidInputException and would therefore escape the error handling of the caller.
        val heldBy = recordSet[KEYS.heldBy]
        val institution = heldBy as? String
        if (heldBy != null && institution == null) {
            log.warn("Property heldBy of record set $key is not a string. Institution filter is left empty.")
        }
        // TODO: resolve the name of the institution. Until then a placeholder label is used.

        val publishedValue = recordSet[KEYS.isPublished]
        val isPublished = when (publishedValue) {
            is String -> publishedValue.toBoolean()
            // A JSON boolean carries the same information as its string form.
            is Boolean -> publishedValue
            else -> {
                log.error("Found no isPublished property on record set $key. Set to false.")
                false
            }
        }

        // The arguments are evaluated in the order written here: the document type lookup runs before
        // the document count, exactly as before.
        return RecordSetSearchDoc(
            recordSetId = id,
            isPublished = isPublished,
            scopeAndContent = description,
            // NOTE(review): fixed range, not derived from the input — confirm whether this is a placeholder.
            periodOfTimeAsYear = IntegerRange(1920, 2020),
            institution = FacetContainer(
                LanguageContainer.placeholder("NoNameInstitution"),
                filter = institution,
                facet = emptyList()
            ),
            supportedByMemoriav = recordSet[KEYS.sponsoredBy] != null,
            name = name,
            documentType = elasticSearchWrapper.getDocumentTypesFromRecords(id, KEYS.QueryFields.recordSetFacet),
            keyVisualLink = (recordSet[KEYS.wikidataImage] as? String) ?: "NoKeyVisualLinkDefined",
            numberOfDocuments = elasticSearchWrapper.countNumberOfDocuments(id),
            lastUpdatedDate = Date.now,
            languageOfMetadata = FacetContainer(
                LanguageContainer.placeholder("Deutsch"),
                filter = null,
                facet = emptyList()
            )
        )
    }

    /**
     * Extracts the language containers from [value] and merges them into a single container.
     *
     * @param value the raw property value taken from the record set entity.
     * @param placeholder text of the placeholder container returned when nothing could be extracted.
     */
    private fun extractLanguageContainer(value: Any?, placeholder: String): LanguageContainer {
        val items = Extract.languageContainer("record set", value)
        return if (items.isEmpty()) {
            LanguageContainer.placeholder(placeholder)
        } else {
            // reduce returns the only element unchanged when the list has a single entry.
            items.reduce { acc, languageContainer -> acc.merge(languageContainer) }
        }
    }
}
......@@ -33,6 +33,7 @@ class Service(file: String = "app.yml") {
val settings = SettingsLoader(
listOf(
KEYS.SettingsProps.institutionTypeLabelsPath,
KEYS.SettingsProps.documentTypeLabelsPath,
KEYS.SettingsProps.mediaUrl,
KEYS.SettingsProps.elasticHost,
KEYS.SettingsProps.elasticPort,
......
......@@ -4,5 +4,5 @@ import java.time.LocalDateTime
import java.time.format.DateTimeFormatter
/**
 * Date helpers.
 */
object Date {
    /**
     * The current local date-time formatted with [DateTimeFormatter.ISO_DATE_TIME].
     *
     * Implemented as a getter so the value is computed on every access. A plain initializer
     * would be evaluated only once when the object is loaded, and every later read would
     * return that same stale timestamp.
     */
    val now: String
        get() = LocalDateTime.now().format(DateTimeFormatter.ISO_DATE_TIME)
}
\ No newline at end of file
package org.memobase.helpers
import org.memobase.model.*
/**
 * Holds fallback values for search documents.
 *
 * NOTE(review): KafkaTopology appears to reference InstitutionSearchDoc.DEFAULT in place of
 * [institutionSearchDoc] — confirm whether this object is still in use.
 */
object Default {
// Placeholder institution search document: the id is "UnknownId", the flag is false, and every
// other argument is an empty container, empty list, empty string or zero.
val institutionSearchDoc = InstitutionSearchDoc(
"UnknownId",
false,
LanguageContainer.EMPTY,
LanguageContainer.EMPTY,
emptyList(),
emptyList(),
emptyList(),
emptyList(),
emptyList(),
emptyList(),
"",
"",
0,
0,
"",
""
)
}
\ No newline at end of file
package org.memobase.helpers
import org.memobase.model.FacetContainer
/**
 * Maps document type codes to facet containers, based on a label file.
 *
 * @param path location of the label file, passed on to [LoadFile.readLabelFile].
 */
class DocumentTypeMapper(path: String) {
    private val containersByCode = LoadFile.readLabelFile(path)

    /**
     * Returns the container stored for [code], or [FacetContainer.DEFAULT] if the label file has no
     * entry for it.
     */
    fun getValue(code: String): FacetContainer = containersByCode[code] ?: FacetContainer.DEFAULT
}
\ No newline at end of file
package org.memobase.helpers
import java.util.Properties
import org.apache.http.HttpHost
import org.apache.logging.log4j.LogManager
import org.elasticsearch.ElasticsearchException
import org.elasticsearch.action.admin.indices.alias.get.GetAliasesRequest
import org.elasticsearch.action.search.ClearScrollRequest
import org.elasticsearch.action.search.SearchRequest
import org.elasticsearch.action.search.SearchScrollRequest
import org.elasticsearch.client.RequestOptions
import org.elasticsearch.client.RestClient
import org.elasticsearch.client.RestHighLevelClient
import org.elasticsearch.client.core.CountRequest
import org.elasticsearch.client.indices.GetIndexRequest
import org.elasticsearch.common.unit.TimeValue
import org.elasticsearch.index.query.QueryBuilders
import org.elasticsearch.index.query.QueryBuilders.termQuery
import org.elasticsearch.search.Scroll
import org.elasticsearch.search.builder.SearchSourceBuilder
import org.memobase.model.FacetContainer
import java.net.SocketTimeoutException
import java.util.*
import org.elasticsearch.search.sort.SortBuilders
/**
* This class facilitates a connection to the Elasticsearch cluster and offers convenience functions to retrieve
......@@ -21,7 +32,8 @@ class ElasticSearchWrapper(settings: Properties) {
private val host = settings.getProperty(KEYS.SettingsProps.elasticHost)
private val port = settings.getProperty(KEYS.SettingsProps.elasticPort).toInt()
private val documentsIndex = settings.getProperty(KEYS.SettingsProps.elasticIndex)
private val client = connect()
private val documentTypeMapper = DocumentTypeMapper(settings.getProperty(KEYS.SettingsProps.documentTypeLabelsPath))
private var client: RestHighLevelClient? = null
/**
* Establishes a connection to the client and ensures, that the index is present. The index may be an alias
......@@ -38,16 +50,17 @@ class ElasticSearchWrapper(settings: Properties) {
*
* TODO: In the future find a better solution.
*/
private fun connect(): RestHighLevelClient? {
fun connect(): RestHighLevelClient? {
return try {
val c = RestHighLevelClient(
RestClient.builder(
HttpHost(host, port)
))
RestClient.builder(
HttpHost(host, port)
)
)
val indexExists = c.indices().exists(GetIndexRequest(documentsIndex), RequestOptions.DEFAULT)
val aliasExists = c.indices().existsAlias(GetAliasesRequest(documentsIndex), RequestOptions.DEFAULT)
if (!c.indices().exists(GetIndexRequest(documentsIndex), RequestOptions.DEFAULT) ||
!c.indices().existsAlias(GetAliasesRequest(documentsIndex), RequestOptions.DEFAULT)) {
if (!indexExists && !aliasExists) {
log.error("Could not find the index or alias defined in the configuration: $documentsIndex.")
null
} else {
......@@ -57,6 +70,9 @@ class ElasticSearchWrapper(settings: Properties) {
} catch (ex: ElasticsearchException) {
log.error(ex.detailedMessage)
null
} catch (ex: SocketTimeoutException) {
log.error(ex.localizedMessage)
null
}
}
......@@ -64,6 +80,9 @@ class ElasticSearchWrapper(settings: Properties) {
* Counts the number of documents attached to a specific record set.
*/
fun countNumberOfDocuments(recordSetIdentifier: String): Int {
// Keep the freshly created connection. Without the assignment the result of connect() is
// discarded, client stays null and the null branch below is always taken.
if (client == null)
client = connect()
return client.let {
if (it == null) {
log.error("Not connected to an index. Count is at zero! Restart service to retry connection.")
......@@ -71,12 +90,13 @@ class ElasticSearchWrapper(settings: Properties) {
} else {
val request = CountRequest(documentsIndex)
request.query(
QueryBuilders.termQuery(
"recordSet.facet", recordSetIdentifier
)
termQuery(
"recordSet.facet", recordSetIdentifier
)
)
try {
val response = it.count(request, RequestOptions.DEFAULT
val response = it.count(
request, RequestOptions.DEFAULT
)
response.count.toInt()
} catch (ex: ElasticsearchException) {
......@@ -86,4 +106,69 @@ class ElasticSearchWrapper(settings: Properties) {
}
}
}
/**
 * Collects the distinct document types of all documents matching a term query.
 *
 * The documents are read page by page with the scroll API. Each distinct value of the "type"
 * source field is mapped to a [FacetContainer] via the document type mapper, in order of first
 * appearance.
 *
 * @param recordSetIdentifier the value the documents are matched against.
 * @param queryField the index field used in the term query.
 * @return one container per distinct type; an empty list when no connection is available or
 * Elasticsearch reports an error.
 */
fun getDocumentTypesFromRecords(recordSetIdentifier: String, queryField: String): List<FacetContainer> {
    if (client == null)
        client = connect()
    // client is a mutable property, so work on a local copy that can be smart-cast.
    val activeClient = client
    if (activeClient == null) {
        log.error("Could not connect to elasticsearch. Try again.")
        return emptyList()
    }
    return try {
        val resultFacets = mutableListOf<FacetContainer>()
        val typeSet = mutableSetOf<String>()

        val scroll = Scroll(TimeValue.timeValueMinutes(1L))
        val searchRequest = SearchRequest(documentsIndex)
        searchRequest.scroll(scroll)
        val searchSourceBuilder = SearchSourceBuilder()
        // Only the fields needed here are fetched from the source.
        searchSourceBuilder.fetchSource(
            arrayOf(
                "id", "type"
            ), emptyArray<String>()
        )
        searchSourceBuilder.query(
            termQuery(
                queryField, recordSetIdentifier
            )
        )
        searchRequest.source(searchSourceBuilder)

        var searchResponse = activeClient.search(searchRequest, RequestOptions.DEFAULT)
        var scrollId = searchResponse.scrollId
        var searchHits = searchResponse.hits.hits
        while (searchHits != null && searchHits.isNotEmpty()) {
            // Evaluate the current page BEFORE requesting the next one. Scrolling first would
            // drop the hits returned by the initial search request.
            for (hit in searchHits) {
                // A non-string type value is skipped instead of failing the whole lookup.
                val type = hit.sourceAsMap["type"] as? String
                // add() returns false when the type was already seen.
                if (type != null && typeSet.add(type)) {
                    resultFacets.add(documentTypeMapper.getValue(type))
                }
            }
            val scrollRequest = SearchScrollRequest(scrollId)
            scrollRequest.scroll(scroll)
            searchResponse = activeClient.scroll(scrollRequest, RequestOptions.DEFAULT)
            scrollId = searchResponse.scrollId
            searchHits = searchResponse.hits.hits
        }

        // Release the scroll context on the server once all pages have been read.
        val clearScrollRequest = ClearScrollRequest()
        clearScrollRequest.addScrollId(scrollId)
        activeClient.clearScroll(clearScrollRequest, RequestOptions.DEFAULT)
        resultFacets
    } catch (ex: ElasticsearchException) {
        log.error(ex.detailedMessage)
        emptyList<FacetContainer>()
    }
}
}
\ No newline at end of file
......@@ -41,7 +41,7 @@ object JSON {
if (it[atType] == NS.rico + Record) {
Pair(record, it)
} else if (it[atType] == NS.rico + RecordSet) {
Pair(record, it)
Pair(recordSet, it)
} else if (it[atType] == NS.rico + CorporateBody && it[type] == memobaseInstitution) {
Pair(institution, it)
} else {
......
......@@ -20,6 +20,7 @@ package org.memobase.helpers
object KEYS {
object SettingsProps {
const val documentTypeLabelsPath = "documentTypeLabelsPath"
const val mediaUrl = "media.url"
const val institutionTypeLabelsPath = "institutionTypeLabelsPath"
const val elasticHost = "elastic.host"
......@@ -70,6 +71,7 @@ object KEYS {
// namespace rda:
const val placeOfCapture = "P60556"
const val producer = "P60441"
const val sponsoredBy = "P60451"
// namespace skos:
const val prefLabel = "prefLabel"
......@@ -100,6 +102,11 @@ object KEYS {
const val missingLabelIt = "GALATEO MANCANTE"
const val missingLabelEn = "MISSING LABEL"
// Names of index fields used in Elasticsearch term queries.
object QueryFields {
// Passed as query field when looking up the document types of a record set.
const val recordSetFacet = "recordSet.facet"
// NOTE(review): presumably the institution counterpart of recordSetFacet — no usage visible here.
const val institutionFacet = "institution.facet"
}
object TitleTypes {
const val main = "main"
const val series = "series"
......
......@@ -2,6 +2,7 @@ package org.memobase.model
import com.fasterxml.jackson.annotation.JsonIgnore
import com.fasterxml.jackson.annotation.JsonInclude
import org.memobase.helpers.Date
@JsonInclude(JsonInclude.Include.NON_EMPTY)
data class InstitutionSearchDoc(
......@@ -30,4 +31,26 @@ data class InstitutionSearchDoc(
val numberOfDocuments: Int,
val teaserColor: String,
val teaserColorComputed: String
) : Schema(institutionId)
\ No newline at end of file
) : Schema(institutionId) {
companion object {
// Placeholder document. KafkaTopology pairs it with a warning report when transforming an
// institution message throws an InvalidInputException.
// All content fields are empty or zero; one field is filled with Date.now.
val DEFAULT = InstitutionSearchDoc(
"DefaultInsitutionId",
false,
LanguageContainer.EMPTY,
LanguageContainer.EMPTY,
emptyList(),
emptyList(),
emptyList(),
emptyList(),
emptyList(),
emptyList(),
Date.now,
"",
0,
0,
"",
""
)
}
}
\ No newline at end of file
package org.memobase.model
/**
 * A pair of integer bounds. Used by the record set search document for its period of time in years.
 *
 * NOTE(review): the property names presumably follow the Elasticsearch range convention
 * (gte = greater than or equal, lte = less than or equal), i.e. both bounds inclusive — confirm
 * against the index mapping.
 */
data class IntegerRange(
val gte: Int,
val lte: Int
)
\ No newline at end of file
package org.memobase.model
import com.fasterxml.jackson.annotation.JsonInclude
/**
 * Data holder describing a record set. The record set identifier is handed to the [Schema]
 * super class.
 *
 * Properties with empty values are left out of the JSON output (JsonInclude.Include.NON_EMPTY).
 *
 * NOTE(review): the record set builder produces RecordSetSearchDoc rather than this class —
 * confirm whether this model is still in use.
 */
@JsonInclude(JsonInclude.Include.NON_EMPTY)
data class RecordSet(
val recordSetIdentifier: String,
val isSupported: Boolean,
val documentType: List<LanguageContainer>,
val timePeriod: TimePeriod,
val keyVisualLink: String,
val institution: LanguageContainer,
val institutionId: String,
val languages: List<LanguageContainer>,
val description: LanguageContainer,
val numberOfDocuments: Int,
val latestUpdate: String
) : Schema(recordSetIdentifier)
......@@ -12,11 +12,11 @@ data class RecordSetSearchDoc(
val isPublished: Boolean,
// Facets
val documentType: FacetContainer,
val documentType: List<FacetContainer>,
val supportedByMemoriav: Boolean,
val languageOfMetadata: FacetContainer,
val institution: FacetContainer,
val periodOfTimeAsYear: String,
val periodOfTimeAsYear: IntegerRange,
// Sort
val lastUpdatedDate: String,
......@@ -34,12 +34,11 @@ data class RecordSetSearchDoc(
recordSetId = "NoRecordSetId",
name = LanguageContainer.placeholder("TEST_RECORD_SET"),
isPublished = false,
documentType = FacetContainer(
documentType = listOf(FacetContainer(
LanguageContainer.placeholder("TEST DOCUMENT TYPE"),
filter = "",
facet = emptyList()
),
)),
supportedByMemoriav = true,
languageOfMetadata = FacetContainer(
LanguageContainer.placeholder("TEST LANGUAGE"),
......@@ -51,7 +50,7 @@ data class RecordSetSearchDoc(
filter = "INSTITUTION_IDENTIFIER",
facet = emptyList()
),
periodOfTimeAsYear = "2020",
periodOfTimeAsYear = IntegerRange(2000, 2020),
scopeAndContent = LanguageContainer.placeholder("Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."),
lastUpdatedDate = "2020-11-20T10:29:01.128",
keyVisualLink = "https://mb-wf1.memobase.unibas.ch/sites/default/files/styles/teaser/public/2020-10/1.jpg?itok=5ncVBnVQ",
......
......@@ -6,6 +6,7 @@ app:
media:
url: ${MEDIA_SERVER_URL:?system}
institutionTypeLabelsPath: "/configs/institution_types/labels.csv"
documentTypeLabelsPath: "/configs/document_types/labels.csv"
kafka:
streams:
bootstrap.servers: ${KAFKA_BOOTSTRAP_SERVERS:?system}
......
package org.memobase
import org.assertj.core.api.Assertions.assertThat
import org.junit.jupiter.api.Test
import org.junit.jupiter.api.TestInstance
import org.junit.jupiter.api.assertAll
import org.memobase.helpers.ElasticSearchWrapper
import java.util.Properties
import org.junit.jupiter.api.Disabled
import org.memobase.helpers.KEYS
@TestInstance(TestInstance.Lifecycle.PER_CLASS)
class TestElasticSearchWrapper {
private val documentTypeLabelPath = "src/test/resources/configs/document-type-labels.csv"
@Test
// @Disabled
fun `test connect`() {
val props = Properties()
props.setProperty("elastic.host", "localhost")
props.setProperty("elastic.port", "8080")
props.setProperty("elastic.index", "documents-v14")
props.setProperty(KEYS.SettingsProps.documentTypeLabelsPath, documentTypeLabelPath)