Commit ba74b75d authored by Jonas Waeber
Browse files

Implements person facet builder.

parent 105918af
/*
* search-doc-service
* Copyright (C) 2020 Memoriav
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package org.memobase
/**
 * String constants for the JSON property names and type names used when
 * reading documents.
 */
object KEYS {
    // Property names under which an object's type is stored.
    const val atType = "@type"
    const val ricoType = "type"

    // Type names; `Person` is appended to a namespace prefix when `@type` values are matched.
    const val Person = "Person"
    const val Instantiation = "Instantiation"

    // Name properties of an agent.
    const val name = "name"
    const val firstName = "firstName"
    const val lastName = "lastName"

    // Property on an agent that references the creation relations targeting it.
    const val agentIsTargetOfCreationRelation = "agentIsTargetOfCreationRelation"

    // NOTE(review): presumably the possible values of the `type` property of a
    // creation relation - confirm against the callers.
    const val contributor = "contributor"
    const val creator = "creator"
}
/*
* search-doc-service
* Copyright (C) 2020 Memoriav
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package org.memobase.builders
import com.beust.klaxon.JsonObject
/**
 * Contract for builders that collect string values from JSON objects.
 *
 * NOTE(review): the intended call order appears to be [filter] to select an
 * object, [append] for each selected object, then [build] once to read the
 * result - confirm against the calling code.
 */
interface IFieldBuilder {
/**
 * Decides whether [jsonObject] is relevant for this builder.
 *
 * @param jsonObject the object to test.
 * @param map further objects available for lookup; presumably keyed by
 * object identifier - verify against the caller.
 * @return `true` when the object is accepted by this builder.
 */
fun filter(jsonObject: JsonObject, map: Map<String, JsonObject>): Boolean
/**
 * Feeds [jsonObject] into the builder.
 *
 * @return a human-readable status message describing the outcome.
 */
fun append(jsonObject: JsonObject): String
/**
 * Returns the values produced by the builder.
 */
fun build(): List<String>
}
/*
* search-doc-service
* Copyright (C) 2020 Memoriav
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package org.memobase.builders
import com.beust.klaxon.JsonObject
import org.memobase.KEYS
import org.memobase.helpers.AsciiFolder
import org.memobase.helpers.Extract
import org.memobase.rdf.NS
/**
 * Builds hierarchical person facet values.
 *
 * Every successfully appended person contributes two values:
 *  - level 0: `0~<LETTER>~#`
 *  - level 1: `1~<LETTER>~<display name>~#`
 *
 * `<LETTER>` is the upper-cased first ASCII letter of the ASCII-folded last
 * name, or of the full name when the person has no last name. The display
 * name is `lastName, firstName` (first name optional) or the full name.
 * Values are collected in a set, so duplicates are dropped.
 *
 * @param ricoType value the `type` property of a referenced creation relation
 * must have for a person to be accepted by [filter].
 */
class PersonFacetBuilder(private val ricoType: String) : IFieldBuilder {
    private val separator = "~"
    private val terminator = "#"
    private val level1 = "0"
    private val level2 = "1"

    // Number of leading characters (level marker + separator) ignored when sorting.
    private val sortKeyOffset = level1.length + separator.length

    private val personFacetValues = mutableSetOf<String>()

    /**
     * Accepts [jsonObject] when its `@type` is the namespaced `Person` type and
     * at least one creation relation it references has the configured type.
     *
     * @param jsonObject the candidate object.
     * @param map objects looked up by the identifiers extracted from the
     * person's `agentIsTargetOfCreationRelation` property.
     * @return `true` when the person should be appended to this facet.
     */
    override fun filter(jsonObject: JsonObject, map: Map<String, JsonObject>): Boolean {
        // A missing or non-string `@type` never equals the expected type string.
        if (jsonObject[KEYS.atType] != NS.rico + KEYS.Person) return false

        return Extract.identifier(jsonObject[KEYS.agentIsTargetOfCreationRelation])
            .any { id -> map[id]?.get(KEYS.ricoType) == ricoType }
    }

    /**
     * Adds the facet values for the person in [jsonObject].
     *
     * Never throws for malformed name data: a missing or non-string name, or a
     * name without any letter from A to Z after ASCII folding, is reported
     * through the returned message and nothing is added.
     *
     * @param jsonObject the person to add.
     * @return a status message describing success or the reason for failure.
     */
    override fun append(jsonObject: JsonObject): String {
        // Safe casts: a key may be present with a null or non-string value.
        val lastName = jsonObject[KEYS.lastName] as? String
        val fullName = jsonObject[KEYS.name] as? String

        // The last name, when available, decides under which letter the person is filed.
        val sortName = lastName ?: fullName
            ?: return "Failed to process person ${jsonObject["@id"]} for person facet, because the person does not have a name."

        val displayName = if (lastName != null) {
            val firstName = jsonObject[KEYS.firstName] as? String
            if (firstName != null) "$lastName, $firstName" else lastName
        } else {
            sortName
        }

        // Folding maps accented characters to ASCII; names with no A-Z letter
        // (empty, numeric, non-Latin script) cannot be filed under a letter.
        val firstLetter = AsciiFolder.foldToASCII(sortName)
            .firstOrNull { it in 'A'..'Z' || it in 'a'..'z' }
            ?: return "Failed to process person ${jsonObject["@id"]} for person facet, because the name '$sortName' does not contain a letter from A to Z."
        val capitalLetter = firstLetter.toUpperCase()

        personFacetValues.add("$level1$separator$capitalLetter$separator$terminator")
        personFacetValues.add("$level2$separator$capitalLetter$separator$displayName$separator$terminator")
        return "Successfully added person to facet list."
    }

    /**
     * Returns the collected facet values sorted by everything after the level
     * prefix, so each letter entry is directly followed by its persons.
     */
    override fun build(): List<String> {
        return personFacetValues.sortedBy { value -> value.substring(sortKeyOffset) }
    }
}
/*
* search-doc-service
* Copyright (C) 2020 Memoriav
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package org.memobase.helpers
object AsciiFolder {
fun foldToASCII(
input: String,
sb: StringBuilder = StringBuilder(input.length)
): String {
val end = input.length
for (pos in 0 until end) {
val c = input[pos]
// Quick test: if it's not in range then just keep current character
if (c < '\u0080') {
sb.append(c)
} else {
when (c) {
'\u00C0', '\u00C1', '\u00C2', '\u00C3', '\u00C4', '\u00C5', '\u0100', '\u0102', '\u0104', '\u018F', '\u01CD', '\u01DE', '\u01E0', '\u01FA', '\u0200', '\u0202', '\u0226', '\u023A', '\u1D00', '\u1E00', '\u1EA0', '\u1EA2', '\u1EA4', '\u1EA6', '\u1EA8', '\u1EAA', '\u1EAC', '\u1EAE', '\u1EB0', '\u1EB2', '\u1EB4', '\u1EB6', '\u24B6', '\uFF21' -> sb.append(
'A'
)
'\u00E0', '\u00E1', '\u00E2', '\u00E3', '\u00E4', '\u00E5', '\u0101', '\u0103', '\u0105', '\u01CE', '\u01DF', '\u01E1', '\u01FB', '\u0201', '\u0203', '\u0227', '\u0250', '\u0259', '\u025A', '\u1D8F', '\u1D95', '\u1E01', '\u1E9A', '\u1EA1', '\u1EA3', '\u1EA5', '\u1EA7', '\u1EA9', '\u1EAB', '\u1EAD', '\u1EAF', '\u1EB1', '\u1EB3', '\u1EB5', '\u1EB7', '\u2090', '\u2094', '\u24D0', '\u2C65', '\u2C6F', '\uFF41' -> sb.append(
'a'
)
'\uA732' -> {
sb.append('A')
sb.append('A')
}
'\u00C6', '\u01E2', '\u01FC', '\u1D01' -> {
sb.append('A')
sb.append('E')
}
'\uA734' -> {
sb.append('A')
sb.append('O')
}
'\uA736' -> {
sb.append('A')
sb.append('U')
}
'\uA738', '\uA73A' -> {
sb.append('A')
sb.append('V')
}
'\uA73C' -> {
sb.append('A')
sb.append('Y')
}
'\u249C' -> {
sb.append('(')
sb.append('a')
sb.append(')')
}
'\uA733' -> {
sb.append('a')
sb.append('a')
}
'\u00E6', '\u01E3', '\u01FD', '\u1D02' -> {
sb.append('a')
sb.append('e')
}
'\uA735' -> {
sb.append('a')
sb.append('o')
}
'\uA737' -> {
sb.append('a')
sb.append('u')
}
'\uA739', '\uA73B' -> {
sb.append('a')
sb.append('v')
}
'\uA73D' -> {
sb.append('a')
sb.append('y')
}
'\u0181', '\u0182', '\u0243', '\u0299', '\u1D03', '\u1E02', '\u1E04', '\u1E06', '\u24B7', '\uFF22' -> sb.append(
'B'
)
'\u0180', '\u0183', '\u0253', '\u1D6C', '\u1D80', '\u1E03', '\u1E05', '\u1E07', '\u24D1', '\uFF42' -> sb.append(
'b'
)
'\u249D' -> {
sb.append('(')
sb.append('b')
sb.append(')')
}
'\u00C7', '\u0106', '\u0108', '\u010A', '\u010C', '\u0187', '\u023B', '\u0297', '\u1D04', '\u1E08', '\u24B8', '\uFF23' -> sb.append(
'C'
)
'\u00E7', '\u0107', '\u0109', '\u010B', '\u010D', '\u0188', '\u023C', '\u0255', '\u1E09', '\u2184', '\u24D2', '\uA73E', '\uA73F', '\uFF43' -> sb.append(
'c'
)
'\u249E' -> {
sb.append('(')
sb.append('c')
sb.append(')')
}
'\u00D0', '\u010E', '\u0110', '\u0189', '\u018A', '\u018B', '\u1D05', '\u1D06', '\u1E0A', '\u1E0C', '\u1E0E', '\u1E10', '\u1E12', '\u24B9', '\uA779', '\uFF24' -> sb.append(
'D'
)
'\u00F0', '\u010F', '\u0111', '\u018C', '\u0221', '\u0256', '\u0257', '\u1D6D', '\u1D81', '\u1D91', '\u1E0B', '\u1E0D', '\u1E0F', '\u1E11', '\u1E13', '\u24D3', '\uA77A', '\uFF44' -> sb.append(
'd'
)
'\u01C4', '\u01F1' -> {
sb.append('D')
sb.append('Z')
}
'\u01C5', '\u01F2' -> {
sb.append('D')
sb.append('z')
}
'\u249F' -> {
sb.append('(')
sb.append('d')
sb.append(')')
}
'\u0238' -> {
sb.append('d')
sb.append('b')
}
'\u01C6', '\u01F3', '\u02A3', '\u02A5' -> {
sb.append('d')
sb.append('z')
}
'\u00C8', '\u00C9', '\u00CA', '\u00CB', '\u0112', '\u0114', '\u0116', '\u0118', '\u011A', '\u018E', '\u0190', '\u0204', '\u0206', '\u0228', '\u0246', '\u1D07', '\u1E14', '\u1E16', '\u1E18', '\u1E1A', '\u1E1C', '\u1EB8', '\u1EBA', '\u1EBC', '\u1EBE', '\u1EC0', '\u1EC2', '\u1EC4', '\u1EC6', '\u24BA', '\u2C7B', '\uFF25' -> sb.append(
'E'
)
'\u00E8', '\u00E9', '\u00EA', '\u00EB', '\u0113', '\u0115', '\u0117', '\u0119', '\u011B', '\u01DD', '\u0205', '\u0207', '\u0229', '\u0247', '\u0258', '\u025B', '\u025C', '\u025D', '\u025E', '\u029A', '\u1D08', '\u1D92', '\u1D93', '\u1D94', '\u1E15', '\u1E17', '\u1E19', '\u1E1B', '\u1E1D', '\u1EB9', '\u1EBB', '\u1EBD', '\u1EBF', '\u1EC1', '\u1EC3', '\u1EC5', '\u1EC7', '\u2091', '\u24D4', '\u2C78', '\uFF45' -> sb.append(
'e'
)
'\u24A0' -> {
sb.append('(')
sb.append('e')
sb.append(')')
}
'\u0191', '\u1E1E', '\u24BB', '\uA730', '\uA77B', '\uA7FB', '\uFF26' -> sb.append('F')
'\u0192', '\u1D6E', '\u1D82', '\u1E1F', '\u1E9B', '\u24D5', '\uA77C', '\uFF46' -> sb.append('f')
'\u24A1' -> {
sb.append('(')
sb.append('f')
sb.append(')')
}
'\uFB00' -> {
sb.append('f')
sb.append('f')
}
'\uFB03' -> {
sb.append('f')
sb.append('f')
sb.append('i')
}
'\uFB04' -> {
sb.append('f')
sb.append('f')
sb.append('l')
}
'\uFB01' -> {
sb.append('f')
sb.append('i')
}
'\uFB02' -> {
sb.append('f')
sb.append('l')
}
'\u011C', '\u011E', '\u0120', '\u0122', '\u0193', '\u01E4', '\u01E5', '\u01E6', '\u01E7', '\u01F4', '\u0262', '\u029B', '\u1E20', '\u24BC', '\uA77D', '\uA77E', '\uFF27' -> sb.append(
'G'
)
'\u011D', '\u011F', '\u0121', '\u0123', '\u01F5', '\u0260', '\u0261', '\u1D77', '\u1D79', '\u1D83', '\u1E21', '\u24D6', '\uA77F', '\uFF47' -> sb.append(
'g'
)
'\u24A2' -> {
sb.append('(')
sb.append('g')
sb.append(')')
}
'\u0124', '\u0126', '\u021E', '\u029C', '\u1E22', '\u1E24', '\u1E26', '\u1E28', '\u1E2A', '\u24BD', '\u2C67', '\u2C75', '\uFF28' -> sb.append(
'H'
)
'\u0125', '\u0127', '\u021F', '\u0265', '\u0266', '\u02AE', '\u02AF', '\u1E23', '\u1E25', '\u1E27', '\u1E29', '\u1E2B', '\u1E96', '\u24D7', '\u2C68', '\u2C76', '\uFF48' -> sb.append(
'h'
)
'\u01F6' -> {
sb.append('H')
sb.append('V')
}
'\u24A3' -> {
sb.append('(')
sb.append('h')
sb.append(')')
}
'\u0195' -> {
sb.append('h')
sb.append('v')
}
'\u00CC', '\u00CD', '\u00CE', '\u00CF', '\u0128', '\u012A', '\u012C', '\u012E', '\u0130', '\u0196', '\u0197', '\u01CF', '\u0208', '\u020A', '\u026A', '\u1D7B', '\u1E2C', '\u1E2E', '\u1EC8', '\u1ECA', '\u24BE', '\uA7FE', '\uFF29' -> sb.append(
'I'
)
'\u00EC', '\u00ED', '\u00EE', '\u00EF', '\u0129', '\u012B', '\u012D', '\u012F', '\u0131', '\u01D0', '\u0209', '\u020B', '\u0268', '\u1D09', '\u1D62', '\u1D7C', '\u1D96', '\u1E2D', '\u1E2F', '\u1EC9', '\u1ECB', '\u2071', '\u24D8', '\uFF49' -> sb.append(
'i'
)
'\u0132' -> {
sb.append('I')
sb.append('J')
}
'\u24A4' -> {
sb.append('(')
sb.append('i')
sb.append(')')
}
'\u0133' -> {
sb.append('i')
sb.append('j')
}
'\u0134', '\u0248', '\u1D0A', '\u24BF', '\uFF2A' -> sb.append('J')
'\u0135', '\u01F0', '\u0237', '\u0249', '\u025F', '\u0284', '\u029D', '\u24D9', '\u2C7C', '\uFF4A' -> sb.append(
'j'
)
'\u24A5' -> {
sb.append('(')
sb.append('j')
sb.append(')')
}
'\u0136', '\u0198', '\u01E8', '\u1D0B', '\u1E30', '\u1E32', '\u1E34', '\u24C0', '\u2C69', '\uA740', '\uA742', '\uA744', '\uFF2B' -> sb.append(
'K'
)
'\u0137', '\u0199', '\u01E9', '\u029E', '\u1D84', '\u1E31', '\u1E33', '\u1E35', '\u24DA', '\u2C6A', '\uA741', '\uA743', '\uA745', '\uFF4B' -> sb.append(
'k'
)
'\u24A6' -> {
sb.append('(')
sb.append('k')
sb.append(')')
}
'\u0139', '\u013B', '\u013D', '\u013F', '\u0141', '\u023D', '\u029F', '\u1D0C', '\u1E36', '\u1E38', '\u1E3A', '\u1E3C', '\u24C1', '\u2C60', '\u2C62', '\uA746', '\uA748', '\uA780', '\uFF2C' -> sb.append(
'L'
)
'\u013A', '\u013C', '\u013E', '\u0140', '\u0142', '\u019A', '\u0234', '\u026B', '\u026C', '\u026D', '\u1D85', '\u1E37', '\u1E39', '\u1E3B', '\u1E3D', '\u24DB', '\u2C61', '\uA747', '\uA749', '\uA781', '\uFF4C' -> sb.append(
'l'
)
'\u01C7' -> {
sb.append('L')
sb.append('J')
}
'\u1EFA' -> {
sb.append('L')
sb.append('L')
}
'\u01C8' -> {
sb.append('L')
sb.append('j')
}
'\u24A7' -> {
sb.append('(')
sb.append('l')
sb.append(')')
}
'\u01C9' -> {
sb.append('l')
sb.append('j')
}
'\u1EFB' -> {
sb.append('l')
sb.append('l')
}
'\u02AA' -> {
sb.append('l')
sb.append('s')
}
'\u02AB' -> {
sb.append('l')
sb.append('z')
}
'\u019C', '\u1D0D', '\u1E3E', '\u1E40', '\u1E42', '\u24C2', '\u2C6E', '\uA7FD', '\uA7FF', '\uFF2D' -> sb.append(
'M'
)
'\u026F', '\u0270', '\u0271', '\u1D6F', '\u1D86', '\u1E3F', '\u1E41', '\u1E43', '\u24DC', '\uFF4D' -> sb.append(
'm'
)
'\u24A8' -> {
sb.append('(')
sb.append('m')
sb.append(')')
}
'\u00D1', '\u0143', '\u0145', '\u0147', '\u014A', '\u019D', '\u01F8', '\u0220', '\u0274', '\u1D0E', '\u1E44', '\u1E46', '\u1E48', '\u1E4A', '\u24C3', '\uFF2E' -> sb.append(
'N'
)
'\u00F1', '\u0144', '\u0146', '\u0148', '\u0149', '\u014B', '\u019E', '\u01F9', '\u0235', '\u0272', '\u0273', '\u1D70', '\u1D87', '\u1E45', '\u1E47', '\u1E49', '\u1E4B', '\u207F', '\u24DD', '\uFF4E' -> sb.append(
'n'
)
'\u01CA' -> {
sb.append('N')
sb.append('J')
}
'\u01CB' -> {
sb.append('N')
sb.append('j')
}
'\u24A9' -> {
sb.append('(')
sb.append('n')
sb.append(')')
}
'\u01CC' -> {
sb.append('n')
sb.append('j')
}
'\u00D2', '\u00D3', '\u00D4', '\u00D5', '\u00D6', '\u00D8', '\u014C', '\u014E', '\u0150', '\u0186', '\u019F', '\u01A0', '\u01D1', '\u01EA', '\u01EC', '\u01FE', '\u020C', '\u020E', '\u022A', '\u022C', '\u022E', '\u0230', '\u1D0F', '\u1D10', '\u1E4C', '\u1E4E', '\u1E50', '\u1E52', '\u1ECC', '\u1ECE', '\u1ED0', '\u1ED2', '\u1ED4', '\u1ED6', '\u1ED8', '\u1EDA', '\u1EDC', '\u1EDE', '\u1EE0', '\u1EE2', '\u24C4', '\uA74A', '\uA74C', '\uFF2F' -> sb.append(
'O'
)
'\u00F2', '\u00F3', '\u00F4', '\u00F5', '\u00F6', '\u00F8', '\u014D', '\u014F', '\u0151', '\u01A1', '\u01D2', '\u01EB', '\u01ED', '\u01FF', '\u020D', '\u020F', '\u022B', '\u022D', '\u022F', '\u0231', '\u0254', '\u0275', '\u1D16', '\u1D17', '\u1D97', '\u1E4D', '\u1E4F', '\u1E51', '\u1E53', '\u1ECD', '\u1ECF', '\u1ED1', '\u1ED3', '\u1ED5', '\u1ED7', '\u1ED9', '\u1EDB', '\u1EDD', '\u1EDF', '\u1EE1', '\u1EE3', '\u2092', '\u24DE', '\u2C7A', '\uA74B', '\uA74D', '\uFF4F' -> sb.append(
'o'
)
'\u0152', '\u0276' -> {
sb.append('O')
sb.append('E')
}
'\uA74E' -> {
sb.append('O')
sb.append('O')
}
'\u0222', '\u1D15' -> {
sb.append('O')
sb.append('U')
}
'\u24AA' -> {
sb.append('(')
sb.append('o')
sb.append(')')
}
'\u0153', '\u1D14' -> {
sb.append('o')
sb.append('e')
}
'\uA74F' -> {
sb.append('o')
sb.append('o')
}
'\u0223' -> {
sb.append('o')
sb.append('u')
}
'\u01A4', '\u1D18', '\u1E54', '\u1E56', '\u24C5', '\u2C63', '\uA750', '\uA752', '\uA754', '\uFF30' -> sb.append(
'P'
)
'\u01A5', '\u1D71', '\u1D7D', '\u1D88', '\u1E55', '\u1E57', '\u24DF', '\uA751', '\uA753', '\uA755', '\uA7FC', '\uFF50' -> sb.append(
'p'
)
'\u24AB' -> {
sb.append('(')
sb.append('p')
sb.append(')')
}
'\u024A', '\u24C6', '\uA756', '\uA758', '\uFF31' -> sb.append('Q')
'\u0138', '\u024B', '\u02A0', '\u24E0', '\uA757', '\uA759', '\uFF51' -> sb.append('q')
'\u24AC' -> {
sb.append('(')
sb.append('q')
sb.append(')')
}
'\u0239' -> {
sb.append('q')
sb.append('p')
}
'\u0154', '\u0156', '\u0158', '\u0210', '\u0212', '\u024C', '\u0280', '\u0281', '\u1D19', '\u1D1A', '\u1E58', '\u1E5A', '\u1E5C', '\u1E5E', '\u24C7', '\u2C64', '\uA75A', '\uA782', '\uFF32' -> sb.append(
'R'
)
'\u0155', '\u0157', '\u0159', '\u0211', '\u0213', '\u024D', '\u027C', '\u027D', '\u027E', '\u027F', '\u1D63', '\u1D72', '\u1D73', '\u1D89', '\u1E59', '\u1E5B', '\u1E5D', '\u1E5F', '\u24E1', '\uA75B', '\uA783', '\uFF52' -> sb.append(
'r'
)
'\u24AD' -> {
sb.append('(')
sb.append('r')
sb.append(')')
}
'\u015A', '\u015C', '\u015E', '\u0160', '\u0218', '\u1E60', '\u1E62', '\u1E64', '\u1E66', '\u1E68', '\u24C8', '\uA731', '\uA785', '\uFF33' -> sb.append(
'S'
)
'\u015B', '\u015D', '\u015F', '\u0161', '\u017F', '\u0219', '\u023F', '\u0282', '\u1D74', '\u1D8A', '\u1E61', '\u1E63', '\u1E65', '\u1E67', '\u1E69', '\u1E9C', '\u1E9D', '\u24E2', '\uA784', '\uFF53' -> sb.append(
's'
)
'\u1E9E' -> {
sb.append('S')
sb.append('S')
}
'\u24AE' -> {
sb.append('(')
sb.append('s')
sb.append(')')
}
'\u00DF' -> {
sb.append('s')