Commit 1604c2f4 authored by Günter Hipler
Browse files

most of the properties should be done by now

parent a75070bd
Pipeline #24373 passed with stages
in 7 minutes and 14 seconds
aag,47.38787196, 8.049455603
abe,46.95089, 7.43706
acj,47.41728, 7.07422
adg,46.8542, 9.5382
afz,47.375219, 8.545961
agl,47.04153, 9.06705
apf,46.5199, 6.6332
atd,46.186235, 8.732027
ati,46.192846, 9.0132
avl,46.52511, 6.62441
azh,47.39077, 8.512219
baa,45.86725, 8.98326
bab,47.551869, 7.589958
bar,46.940555555, 7.446388888
baz,47.37237, 8.54582
bbb,46.9473, 7.4483
bcf,46.8066, 7.15597
bfl,46.92833333, 7.45166667
bmf,47.5567059, 7.578635
bpu,46.9902, 6.9311
bvc,47.1042, 6.82634
cde,46.19309, 9.01245
cdt,46.19908, 6.13594
cag,46.1992, 6.1376
cic,46.2274, 6.1373
clg,46.87994, 8.64185
clu,47.10016, 6.8239
csa,46.60208, 6.53612
fad,46.44019, 8.93677
fer,46.1314399, 8.802462
fgr,46.85014, 9.53323
fpc,46.194709, 9.024187
fss,47.4958, 8.7383
gvs,46.88057, 8.64469
hgk,47.53307, 7.61098
hsl,47.07143, 8.27772
ias,46.52378, 6.58423
ikg,46.85111, 9.533846
ikr,46.22169, 6.12565
kak,47.3941882, 8.0587246
kbg,46.849522222, 9.533855555
kek,47.12683, 8.75293
khz,47.370278, 8.548056
klu,47.04138, 8.310911
kmm,47.06376, 7.09299
lfg,47.11458, 8.38545
lkb,46.94049, 7.44218
lmz,47.379166666, 8.539722222
maa,47.3944147, 8.0452584
mav,46.938154, 7.394621
mcl,46.0043, 8.95327
mdl,46.38, 6.24018
meg,46.197797222, 6.137313888
mel,46.509824, 6.632767
mem,46.1736, 8.81054
mfk,46.94186, 7.45004
mgb,46.6167, 7.058596
mgz,47.39077, 8.512219
mhl,46.5221, 6.63491
mov,46.94134, 7.436
mws,46.23168, 7.35853
raf,47.50045, 8.72527
rkk,47.37968, 8.52745
rra,47.37845, 8.52948
rti,46.00373, 8.9512
rtr,46.8520381, 9.5344336
rts,46.2, 6.083333333
rxb,47.53496, 7.59477
sap,46.949135, 7.436426111
sbb,47.47540096, 8.205857926
sik,47.362485, 8.555264
snb,46.941444, 7.449667
snp,46.00605, 8.9399
son,46.94814, 7.45241
soz,47.366827777, 8.547530555
srf,47.40146, 8.53547
sts,47.6973, 8.6337185
swi,46.943487611, 7.473449888
ubb,47.5594, 7.5812
vks,47.052777777, 8.335833333
zbz,47.374166666, 8.545277777
zem,46.96684, 7.45468
\ No newline at end of file
......@@ -4,6 +4,7 @@ app:
#accessTermLabelsPath: "/home/swissbib/environment/code/repositories/memoriav/gitlab/services/postprocessing/rico-edm-transformer/configs/access_terms/labels.csv"
#reuseStatementLabelsPath: "/home/swissbib/environment/code/repositories/memoriav/gitlab/services/postprocessing/rico-edm-transformer/configs/reuse_statements/labels.csv"
isocodemapping: ${ISOCODE_MAPPING:?system}
institutionscoordinates: ${INSTITUTIONS_COORDINATES:?system}
elastic:
host: ${ELASTIC_HOST:?system}
port: ${ELASTIC_PORT:?system}
......
......@@ -42,6 +42,7 @@ object Main extends Logging {
"reuseStatementLabelsPath"
).asJava,*/
List(
Keys.INSTITUTIONS_COORDINATES_MAPPING,
Keys.LANGUAGE_ISO_CODE,
Keys.ELASTIC_HOST,
Keys.ELASTIC_PORT,
......@@ -68,7 +69,8 @@ object Main extends Logging {
val shutdownGracePeriodMs = 10000
Helper.initEnrichementMapping(settings.getAppSettings)
Helper.initLanguageCodeMapping(settings.getAppSettings)
Helper.initInstitutionsCoordinateMapping(settings.getAppSettings)
ElasticSearchClientWrapper(settings.getAppSettings)
logger.trace("Starting stream processing")
......
......@@ -23,7 +23,7 @@ package ch.memobase.rico2edm.edm
import ch.memobase.rico2edm.edm
import ch.memobase.rico2edm.edm.subjects.{Aggregation, Concept, ModelXMLTransformer, Place, ProvidedCHO, TimeSpan, WebResource}
import ch.memobase.rico2edm.utils.Helper
import ch.memobase.rico2edm.utils.{ElasticSearchClientWrapper, Helper}
import java.time.format.DateTimeFormatter
import scala.collection.mutable
......@@ -72,9 +72,9 @@ class EDM {
val esObject = ModelXMLTransformer(model = choExtraction.obj.getModel,
id = shortRecordId,
recordset = Extractors.recordSetOrInstitution(record)("isPartOf")
recordset = Extractors.recordSetOrInstitution(record.get)("isPartOf")
.map( ident => EDM.getInstitutionOrRecordsetIdent(ident)),
institution = Extractors.recordSetOrInstitution(record)("heldBy")
institution = Extractors.recordSetOrInstitution(record.get)("heldBy")
.map( identInstitution => EDM.getInstitutionOrRecordsetIdent(identInstitution))
)
......@@ -179,6 +179,27 @@ class EDM {
Extractors.edmType(record.value)
.foreach(c => cho.addEdmType(Some(c)))
Extractors
.recordSetOrInstitution(record.value)("heldBy")
.foreach(c => Helper.getInstitutionCoord(
EDM.getInstitutionOrRecordsetIdent(c)
).map(indexValue =>
cho.addCurrentLocation(Some(indexValue))))
Extractors
.recordSetOrInstitution(record.value)("isPartOf")
.foreach(c => ElasticSearchClientWrapper.getRecordsetName(
EDM.getInstitutionOrRecordsetIdent(c)
).map(indexValue =>
cho.addIsPartOf(Some(indexValue))))
/*
recordset = Extractors.recordSetOrInstitution(record.get)("isPartOf")
.map( ident => EDM.getInstitutionOrRecordsetIdent(ident)),
*/
ExtractionResult(cho)
......@@ -272,6 +293,14 @@ class EDM {
//fixed value for provider
aggregation.addProvider(Some("Memoriav"))
Extractors
.recordSetOrInstitution(record.value)("heldBy")
.foreach(c => ElasticSearchClientWrapper.getHeldBy(
EDM.getInstitutionOrRecordsetIdent(c)
).map(indexValue =>
aggregation.addDataProvider(Some(indexValue))))
Option(ExtractionResult(aggregation))
} else {
......
......@@ -111,13 +111,13 @@ object Extractors {
"https://www.ica.org/standards/RiC/ontology#Record"
)
private val isHttpIdentifier = "^http.*".r
val recordSetOrInstitution: Try[mutable.LinkedHashMap[String, JValue]] => String => List[String] =
val recordSetOrInstitution: mutable.LinkedHashMap[String, JValue] => String => List[String] =
record =>
property => {
val idents = if (stringValue(record.get)(property).isDefined) {
List(stringValue(record.get)(property).get)
} else if (arrayValue(record.get)(property).isDefined) {
arrayValue(record.get)(property).get.map(_.str).toList
val idents = if (stringValue(record)(property).isDefined) {
List(stringValue(record)(property).get)
} else if (arrayValue(record)(property).isDefined) {
arrayValue(record)(property).get.map(_.str).toList
} else {
List.empty[String]
}
......
......@@ -95,6 +95,13 @@ class ProvidedCHO (val id: String) {
/** Adds an `EDMVocab.TYPE` literal for this CHO when a type is supplied; a `None` is ignored. */
def addEdmType(edmtype:Option[String]): Unit =
  edmtype.foreach { typeValue =>
    model.add(iri(id), EDMVocab.TYPE, factory.createLiteral(typeValue))
  }
/**
 * Adds an `EDMVocab.CURRENT_LOCATION` literal for this CHO.
 *
 * The two strings of the pair are rendered as "first / second"
 * (presumably latitude / longitude - verify against the caller).
 * A `None` is ignored.
 */
def addCurrentLocation(edmCurrentLocation:Option[(String,String)]): Unit =
  edmCurrentLocation.foreach { pair =>
    val rendered = s"${pair._1} / ${pair._2}"
    model.add(iri(id), EDMVocab.CURRENT_LOCATION, factory.createLiteral(rendered))
  }
/** Adds a `DCTERMS.IS_PART_OF` literal for this CHO when a value is supplied; a `None` is ignored. */
def addIsPartOf(dcTermsPartOf:Option[String]): Unit =
  dcTermsPartOf.foreach { partOf =>
    model.add(iri(id), DCTERMS.IS_PART_OF, factory.createLiteral(partOf))
  }
def getModel: Model = model
......@@ -187,6 +194,12 @@ class Aggregation(private val id: String) {
}
/** Adds an `EDMVocab.DATA_PROVIDER` literal to this aggregation when a value is supplied; a `None` is ignored. */
def addDataProvider(edmDataProvider: Option[String]): Unit =
  edmDataProvider.foreach { provider =>
    model.add(iri(id), EDMVocab.DATA_PROVIDER, factory.createLiteral(provider))
  }
def addEDMObjectNoFoto(edmObjectNoFoto: Option[String]): Unit = {
edmObjectNoFoto.map(a => {
if (identValue.matches(edmObjectNoFoto.get)) {
......
......@@ -45,6 +45,8 @@ object EDMVocab extends VocabularyFactory("http://www.europeana.eu/schemas/edm/"
val IS_SHOWN_BY: IRI = getIri("isShownBy")
val OBJECT: IRI = getIri("object")
val PROVIDER: IRI = getIri("provider")
val CURRENT_LOCATION: IRI = getIri("currentLocation")
val DATA_PROVIDER: IRI = getIri("dataProvider")
}
......
......@@ -24,19 +24,57 @@ import com.typesafe.scalalogging.Logger
import org.apache.http.{Header, HttpHost}
import org.apache.http.message.BasicHeader
import org.apache.logging.log4j.scala.Logging
import org.elasticsearch.client.{RestClient, RestHighLevelClient}
import org.elasticsearch.action.get.GetRequest
import org.elasticsearch.client.{RequestOptions, RestClient, RestHighLevelClient}
import java.util
import java.util.Properties
import scala.collection.mutable.ArrayBuffer
import scala.util.{Failure, Success, Try}
import scala.jdk.CollectionConverters._
import java.util.{ArrayList => JArrayList}
import java.util.{HashMap => JHashMap}
/**
 * Wraps an Elasticsearch high-level REST client and offers name lookups
 * for institutions and record sets.
 *
 * @param client  the Elasticsearch client used for all GET requests
 * @param indices maps configuration keys (see `Keys`) to index names;
 *                missing keys fall back to hard-coded default index names
 */
class ElasticSearchClientWrapper private (val client: RestHighLevelClient, val indices: Map[String,String]) {
  type ESStringList = JHashMap[String,JArrayList[String]]

  /**
   * Looks up the German ("de") name of an institution.
   *
   * @param id document id of the institution
   * @return `Success(name)`, or `Failure` when the document is missing or the request fails
   */
  private def getInstitutionNameById (id: String) = Try {
    fetchGermanName(indices.getOrElse(Keys.INSTITUTION_INDEX,"institutions-v4"), id, "institution")
  }

  /**
   * Looks up the German ("de") name of a record set.
   *
   * @param recordSetId document id of the record set
   * @return `Success(name)`, or `Failure` when the document is missing or the request fails
   */
  private def getRecordSetName (recordSetId: String) = Try {
    fetchGermanName(indices.getOrElse(Keys.RECORDSET_INDEX,"record-sets-v4"), recordSetId, "recordset")
  }

  /**
   * Fetches document `id` from `index` and returns the first entry of `name.de`.
   *
   * Returns "unknown" when the document has no "name" field, no "de" entry
   * or an empty "de" list. The previous code tried to build that default with
   * `new JArrayList[String]{"unknown"}`, which is an anonymous subclass whose
   * body merely evaluates the string: the list stayed empty and `.get(0)`
   * threw `IndexOutOfBoundsException` instead of yielding the fallback.
   *
   * @param index name of the Elasticsearch index to query
   * @param id    document id
   * @param kind  label used in the "not found" error message
   * @throws Exception when the document has no source (i.e. was not found)
   */
  private def fetchGermanName(index: String, id: String, kind: String): String = {
    val getResponse = client.get(new GetRequest(index, id), RequestOptions.DEFAULT)
    if (getResponse.isSourceEmpty) {
      throw new Exception(s"$kind with identifier $id not found ")
    }
    val names = getResponse.getSource.asScala
      .getOrElse("name", new ESStringList())
      .asInstanceOf[ESStringList]
    // HashMap.get yields null for a missing key; Option(...) turns that into None.
    Option(names.get("de"))
      .filterNot(_.isEmpty)
      .map(_.get(0))
      .getOrElse("unknown")
  }
}
object ElasticSearchClientWrapper extends Logging{
private var client: Option[ElasticSearchClientWrapper] = None
private val identValue = "^https.*".r
// not purely functional (relies on mutable singleton state), but currently the best fit for our needs
def apply(props: Properties): Boolean =
......@@ -55,6 +93,33 @@ object ElasticSearchClientWrapper extends Logging{
false
}
/**
 * Resolves the name of the institution with the given id.
 *
 * @param id institution identifier used as the document id
 * @return the name, or `None` when no client has been initialised or the
 *         lookup failed (the failure is logged)
 */
def getHeldBy(id: String): Option[String] =
  client.flatMap { wrapper =>
    wrapper.getInstitutionNameById(id) match {
      case Failure(exception) =>
        logger.error(s"error trying to get institution title: $exception")
        None
      case Success(instTitle) =>
        Some(instTitle)
    }
  }
/**
 * Resolves the name of the record set with the given id.
 *
 * @param idRecordSet record set identifier used as the document id
 * @return the name, or `None` when no client has been initialised or the
 *         lookup failed (the failure is logged)
 */
def getRecordsetName(idRecordSet: String): Option[String] =
  client.flatMap { wrapper =>
    wrapper.getRecordSetName(idRecordSet) match {
      case Failure(exception) =>
        logger.error(s"error trying to get recordset title: $exception")
        None
      case Success(instTitle) =>
        Some(instTitle)
    }
  }
private def connect(props: Properties) = Try {
val hosts = new ArrayBuffer[HttpHost]
......
......@@ -30,6 +30,10 @@ object Helper {
// Language code lookup table; None until the mapping file has been loaded by the init method.
private var isoLanguageCodes: Option[JHashMap[String,String]] = None
// First CSV column -> (second, third column) of the institutions coordinates file;
// None until initInstitutionsCoordinateMapping has run. Read by getInstitutionCoord.
private var institutionsCoordinates: Option[JHashMap[String,(String,String)]] = None
def compress(data: Array[Byte]): Array[Byte] = {
val deflater = new Deflater()
deflater.setInput(data)
......@@ -50,7 +54,7 @@ object Helper {
}
def initEnrichementMapping(props: Properties):Unit = {
def initLanguageCodeMapping(props: Properties):Unit = {
val isoCodes = new JHashMap[String,String]()
......@@ -61,6 +65,21 @@ object Helper {
}
}
isoLanguageCodes = Some(isoCodes)
}
/**
 * Loads the institutions coordinates CSV into memory.
 *
 * The file path is read from the `Keys.INSTITUTIONS_COORDINATES_MAPPING`
 * property. Every non-blank line must contain at least three comma separated
 * fields; the first is used as key and the second and third as value pair.
 * Fields are trimmed. Blank lines are skipped instead of crashing the start-up.
 *
 * @param props application properties holding the path of the CSV file
 * @throws IllegalArgumentException when a non-blank line has fewer than three fields
 */
def initInstitutionsCoordinateMapping(props: Properties):Unit = {
  val coord = new JHashMap[String,(String,String)]()
  using(io.Source.fromFile(props.get(Keys.INSTITUTIONS_COORDINATES_MAPPING).toString)) { source =>
    for (line <- source.getLines() if line.trim.nonEmpty) {
      line.split(",").map(_.trim) match {
        // extra trailing fields are ignored, as before
        case Array(key, first, second, _*) =>
          coord.put(key, (first, second))
        case _ =>
          throw new IllegalArgumentException(
            s"malformed line in institutions coordinates mapping (expected 3 fields): '$line'")
      }
    }
  }
  institutionsCoordinates = Some(coord)
}
//noinspection ScalaStyle
......@@ -82,7 +101,17 @@ object Helper {
}
}
/**
 * Returns the coordinate pair stored for an institution.
 *
 * @param institutionId key of the institution in the coordinates mapping
 * @return the stored pair, or `None` when the mapping has not been
 *         initialised or contains no entry for the id
 */
def getInstitutionCoord(institutionId:String):Option[(String,String)] =
  institutionsCoordinates
    .filter(_.containsKey(institutionId))
    .map(_.get(institutionId))
}
......@@ -23,6 +23,7 @@ package ch.memobase.rico2edm.utils
object Keys {
val LANGUAGE_ISO_CODE = "isocodemapping"
val INSTITUTIONS_COORDINATES_MAPPING = "institutionscoordinates"
val INSTITUTION_INDEX = "elastic.institutionIndex"
val RECORDSET_INDEX = "elastic.recordSetIndex"
val ELASTIC_HOST = "elastic.host"
......
"ID","coordinates"
"aag","47.38787196, 8.049455603"
"abe","46.95089, 7.43706"
"acj","47.41728, 7.07422"
......
This diff is collapsed.
......@@ -52,6 +52,7 @@ class CHOSpec extends AnyFunSuite with Matchers{
private lazy val ricoRawContributor = loadFile("src/test/resources/raw.contributor.json")
private lazy val ricoheldByArray = loadFile("src/test/resources/rico.heldBy.array.json")
test ("create CHOObject with Identifier") {
......@@ -444,7 +445,7 @@ class CHOSpec extends AnyFunSuite with Matchers{
val record = Extractors.record(graph)
//extract institutions
val identsInstitution = Extractors.recordSetOrInstitution(record)("heldBy")
val identsInstitution = Extractors.recordSetOrInstitution(record.get)("heldBy")
assert(identsInstitution.length == 2 && identsInstitution.head == "https://memobase.ch/institution/rts" &&
identsInstitution(1) == "https://memobase.ch/institution/nurEinTest")
......@@ -454,7 +455,7 @@ class CHOSpec extends AnyFunSuite with Matchers{
//extract recordsets
//isPartOf
val identsRecordsets = Extractors.recordSetOrInstitution(record)("isPartOf")
val identsRecordsets = Extractors.recordSetOrInstitution(record.get)("isPartOf")
assert(identsRecordsets.length == 1 && identsRecordsets.head == "https://memobase.ch/recordSet/rts-002")
val kuerzelRecordsets = identsRecordsets.map( ident => EDM.getInstitutionOrRecordsetIdent(ident))
assert(kuerzelRecordsets.length == 1 && kuerzelRecordsets.head == "rts-002")
......@@ -468,6 +469,25 @@ class CHOSpec extends AnyFunSuite with Matchers{
assert(edmType.isDefined && edmType.get == "SOUND")
}
// Verifies that heldBy is extracted both from a single JSON string and from a
// JSON array, and that the institution URL is reduced to its short identifier.
test ("record heldBy") {
  val graph = Extractors.jsonGraph(ricoDuration).get.arr
  val record = Extractors.record(graph)
  val ok = Extractors.recordSetOrInstitution(record.get)("heldBy")
  assert(ok.nonEmpty && ok.head == "https://memobase.ch/institution/raf")
  // normally heldBy is expected to be a JSON string, but an array must be handled as well
  val graphHeldByArray = Extractors.jsonGraph(ricoheldByArray).get.arr
  val recordHeldByArray = Extractors.record(graphHeldByArray)
  val isList = Extractors.recordSetOrInstitution(recordHeldByArray.get)("heldBy")
  assert(isList.length == 2)
  // assert on the shortened identifier instead of only printing it
  val shortIdents = ok.map(identInstitution => EDM.getInstitutionOrRecordsetIdent(identInstitution))
  assert(shortIdents.nonEmpty && shortIdents.head == "raf")
}
}
......@@ -38,7 +38,7 @@ class HelperSpec extends AnyFunSuite with Matchers {
test ("load language iso codes") {
val props = new Properties()
props.put(Keys.LANGUAGE_ISO_CODE,"src/test/resources/enrichement/few-language-codes.csv")
Helper.initEnrichementMapping(props)
Helper.initLanguageCodeMapping(props)
assert(Helper.getLanguageCode("http://www.wikidata.org/entity/Q27683").get == "ace")
//id is not available
assert(Helper.getLanguageCode("http://www.wikidata.org/entity/Q2768").isEmpty)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment