To mitigate brute-force attacks against GitLab accounts, we are moving all logins to edu-ID. We would like to remind you to link your account with your edu-ID. After November 30, 2021, login will be possible only via edu-ID. Here you can find the instructions for linking your account.

If you don't have a SWITCH edu-ID yet, you can create one by following the guide here.

Kind regards,

ElasticSearchWrapper.kt 5.83 KB
Newer Older
Jonas Waeber's avatar
Jonas Waeber committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
/*
 * search-doc-service
 * Copyright (C) 2020 Memoriav
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 */
18
19
package org.memobase.helpers

20
21
import com.beust.klaxon.Klaxon
import com.beust.klaxon.KlaxonException
Jonas Waeber's avatar
Jonas Waeber committed
22
import java.util.Properties
23
24
import org.apache.logging.log4j.LogManager
import org.elasticsearch.ElasticsearchException
25
26
27
import org.elasticsearch.action.search.ClearScrollRequest
import org.elasticsearch.action.search.SearchRequest
import org.elasticsearch.action.search.SearchScrollRequest
28
29
30
import org.elasticsearch.client.RequestOptions
import org.elasticsearch.client.RestHighLevelClient
import org.elasticsearch.client.core.CountRequest
31
32
33
34
35
36
import org.elasticsearch.common.unit.TimeValue
import org.elasticsearch.index.query.QueryBuilders.termQuery
import org.elasticsearch.search.Scroll
import org.elasticsearch.search.builder.SearchSourceBuilder
import org.memobase.model.FacetContainer

37

Jonas Waeber's avatar
Jonas Waeber committed
38
39
40
41
/**
 * Convenience wrapper around a [RestHighLevelClient] for querying the documents index.
 *
 * The index name is read once from [settings] (key `KEYS.SettingsProps.elasticIndex`) when the wrapper
 * is constructed. Elasticsearch errors raised by the queries are logged and turned into neutral results
 * (`0` / an empty list) instead of being propagated to the caller.
 *
 * @param settings Application settings holding the name of the documents index.
 * @param client The Elasticsearch client every request is sent through.
 * @param translationMappers Resolves a raw document type value into its [FacetContainer].
 */
class ElasticSearchWrapper(
    settings: Properties,
    private val client: RestHighLevelClient,
    private val translationMappers: TranslationMappers
) {
    private val log = LogManager.getLogger("ElasticSearchWrapper")
    private val documentsIndex = settings.getProperty(KEYS.SettingsProps.elasticIndex)

    private val klaxon = Klaxon()

    /**
     * Counts the number of documents attached to a specific record set.
     *
     * The documents are matched with a term query on the `recordSet.facet` field.
     *
     * @param recordSetIdentifier The memobase identifier of the record set.
     *
     * @return Number of documents, capped at [Int.MAX_VALUE]; `0` if Elasticsearch reports an error.
     */
    fun countNumberOfDocuments(recordSetIdentifier: String): Int {
        log.info("Counting documents for record set $recordSetIdentifier.")
        val request = CountRequest(documentsIndex)
        request.query(
            termQuery(
                "recordSet.facet", recordSetIdentifier
            )
        )
        return try {
            val response = client.count(
                request, RequestOptions.DEFAULT
            )
            // The API reports a Long; clamp it so that a huge index cannot wrap around into a bogus Int.
            val count = response.count.coerceAtMost(Int.MAX_VALUE.toLong()).toInt()
            log.info("Found $count documents for record set $recordSetIdentifier.")
            count
        } catch (ex: ElasticsearchException) {
            log.error(ex.detailedMessage)
            0
        }
    }

    /**
     * Collects all the document types present in the index for a specific record set.
     *
     * All matching documents are walked with the scroll API. Only the `id` and `type.filter` source
     * fields are fetched. Each distinct `type.filter` value is translated exactly once, in order of
     * first appearance. Documents whose source cannot be parsed are logged and skipped.
     *
     * @param recordSetIdentifier The memobase identifier of the record set.
     * @param queryField Name of the field the identifier is stored in.
     *
     * @return A list of unique document type facet containers; an empty list if Elasticsearch reports
     * an error.
     */
    fun getDocumentTypesFromRecords(recordSetIdentifier: String, queryField: String): List<FacetContainer> {
        // Declared outside of the try block so that the scroll context can be released in `finally`.
        var scrollId: String? = null
        return try {
            log.info("Attempting to load document type for $recordSetIdentifier in field $queryField.")
            val resultFacets = mutableListOf<FacetContainer>()
            val typeSet = mutableSetOf<String>()
            val scroll = Scroll(TimeValue.timeValueMinutes(1L))
            val searchRequest = SearchRequest(documentsIndex)
            searchRequest.scroll(scroll)
            val searchSourceBuilder = SearchSourceBuilder()
            searchSourceBuilder.fetchSource(
                arrayOf(
                    "id", "type.filter"
                ), emptyArray<String>()
            )
            searchSourceBuilder.query(
                termQuery(
                    queryField, recordSetIdentifier
                )
            )
            searchRequest.source(searchSourceBuilder)
            var searchResponse = client.search(searchRequest, RequestOptions.DEFAULT)
            scrollId = searchResponse.scrollId
            var searchHits = searchResponse.hits.hits

            while (searchHits != null && searchHits.isNotEmpty()) {
                // Process the page we already hold BEFORE requesting the next one; otherwise the
                // hits returned by the initial search request would never be looked at.
                for (hit in searchHits) {
                    // A hit without a source carries no type information.
                    val source = hit.sourceAsString ?: continue
                    try {
                        val document = klaxon.parse<DocumentResponseSource>(source)
                        if (document != null) {
                            val documentType = document.type.filter
                            // `add` returns true only for values not seen before.
                            if (typeSet.add(documentType)) {
                                resultFacets.add(translationMappers.getDocumentType(documentType))
                            }
                        }
                    } catch (ex: KlaxonException) {
                        log.error("Unable to parse $source from index $documentsIndex.")
                    }
                }
                val scrollRequest = SearchScrollRequest(scrollId)
                scrollRequest.scroll(scroll)
                searchResponse = client.scroll(scrollRequest, RequestOptions.DEFAULT)
                scrollId = searchResponse.scrollId
                searchHits = searchResponse.hits.hits
            }
            log.info("Found the following types $typeSet for record set $recordSetIdentifier.")
            resultFacets
        } catch (ex: ElasticsearchException) {
            log.error(ex.detailedMessage)
            emptyList()
        } finally {
            releaseScroll(scrollId)
        }
    }

    /**
     * Releases the scroll context identified by [scrollId], if there is one.
     *
     * This is best-effort cleanup: an Elasticsearch error while clearing is only logged, so that
     * it cannot discard results that were already collected.
     */
    private fun releaseScroll(scrollId: String?) {
        if (scrollId == null) return
        try {
            val clearScrollRequest = ClearScrollRequest()
            clearScrollRequest.addScrollId(scrollId)
            client.clearScroll(clearScrollRequest, RequestOptions.DEFAULT)
        } catch (ex: ElasticsearchException) {
            log.warn("Unable to clear scroll context: ${ex.detailedMessage}")
        }
    }
}