Commit 6a133eb8 authored by Günter Hipler's avatar Günter Hipler
Browse files

first creation of an elastic search object

parent 0ad2e276
Pipeline #22456 passed with stages
in 6 minutes and 48 seconds
......@@ -15,6 +15,7 @@ curl -XPUT "http://localhost:8080/oai-v1" -H 'Content-Type: application/json' -d
"format": {"type": "keyword"},
"published": {"type": "boolean"},
"recordset": {"type": "keyword"},
"institution": {"type": "keyword"},
"lastUpdatedDate": {
"type": "keyword",
"fields": {
......
......@@ -17,7 +17,11 @@ could be used for oai set purposes
indicator if document should be delivered (harvested) by the OAI API
### recordset
the data the document is part of - don't know if we use this for the API at the moment
the collection (Bestand) the document is part of - it is not yet clear whether the API uses this field
### institution
institution the data belongs to
### lastUpdatedDate
timestamp necessary for harvesting (from | until)
......
......@@ -16,4 +16,5 @@ Export des topic
kafkacat -C -b mb-ka1:9092 -t fedora-output-json-records -K '\t' -o beginning > fedora-output.json
Import des topic
docker run -v /home/swissbib/environment/code/repositories/memoriav/gitlab/services/postprocessing/rico-edm-transformer/data:/data -it --network host edenhill/kafkacat:1.6.0 -P -b VPN:9092 -t fedora-output-json-records -K '\t' -l /data/fedora-output.json
\ No newline at end of file
docker run -v /home/swissbib/environment/code/repositories/memoriav/gitlab/services/postprocessing/rico-edm-transformer/data:/data -it --network host edenhill/kafkacat:1.6.0 -P -b VPN:9092 -t fedora-output-json-records -K '\t' -l /data/fedora-output.json
......@@ -109,9 +109,13 @@ class KafkaTopology extends Logging {
stream
.map((k, v) =>
(
s"https://memobase.ch/record/$k",
//todo: define OAI ID
//s"https://memobase.ch/record/$k",
k,
ReportingObject(
s"https://memobase.ch/record/$k",
//todo: define OAI ID
//s"https://memobase.ch/record/$k",
k,
ProcessingWarning,
v.get.warnings.mkString("\n")
).toString
......@@ -126,9 +130,11 @@ class KafkaTopology extends Logging {
stream
.map((k, _) =>
(
s"https://memobase.ch/record/$k",
//s"https://memobase.ch/record/$k",
k,
ReportingObject(
s"https://memobase.ch/record/$k",
//s"https://memobase.ch/record/$k",
k,
ProcessingSuccess,
"EDM document for Europeana successfully created"
).toString
......@@ -143,9 +149,11 @@ class KafkaTopology extends Logging {
stream
.map((k, v) =>
(
s"https://memobase.ch/record/$k",
//s"https://memobase.ch/record/$k",
k,
ReportingObject(
s"https://memobase.ch/record/$k",
//s"https://memobase.ch/record/$k",
k,
ProcessingFatal,
s"Error creating EDM document: ${v.failed.get.getMessage}"
).toString
......@@ -161,9 +169,11 @@ class KafkaTopology extends Logging {
stream
.map((k, _) =>
(
s"https://memobase.ch/record/$k",
//s"https://memobase.ch/record/$k",
k,
ReportingObject(
s"https://memobase.ch/record/$k",
//s"https://memobase.ch/record/$k",
k,
ProcessingIgnore,
message
).toString
......
......@@ -52,7 +52,7 @@ case class ReportingObject(
override def toString: String =
ujson.write(
ujson.Obj(
("step", "iiif-manifest-creator"),
("step", "rico-edm-transformer"),
("timestamp", createTimestamp),
("id", id),
("status", status.value),
......
......@@ -21,26 +21,22 @@
package ch.memobase.edm
import ch.memobase.rdf.Helper
import ch.memobase.rdf.vocabularies.EDM
import org.eclipse.rdf4j.model.{IRI, Model, Resource}
import org.eclipse.rdf4j.model.impl.{DynamicModelFactory, SimpleValueFactory}
import org.eclipse.rdf4j.model.util.Values.iri
import org.eclipse.rdf4j.model.vocabulary.{DC, DCTERMS, RDF}
import ch.memobase.edm.subjects.{ModelXMLTransformer, ProvidedCHO, WebResource}
import java.time.format.DateTimeFormatter
import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer
import scala.util.Try
import ujson.{Arr => JArr, Obj => JObj}
import java.util
import scala.jdk.CollectionConverters._
class EDM {
private val edmRdfModel = new EDMRdf4jModel
private val dateTimeFormatter = DateTimeFormatter.ISO_DATE
//quite a lot of examples with datetimeformatter
//https://www.dariawan.com/tutorials/java/java-datetimeformatter-tutorial-examples/
//decide which is more appropriate for OAI
//private val dateTimeFormatter = DateTimeFormatter.ISO_DATE
private val dateTimeFormatter = DateTimeFormatter.ISO_INSTANT
def create(messageValue: String): Try[ExtractionResult[(String, String)]] = {
Try {
......@@ -48,22 +44,25 @@ class EDM {
val graph = Extractors.jsonGraph(messageValue).get.arr
val digitalObject = Extractors.digitalObject(graph).get
val record = Extractors.record(graph).get
val recordId = Extractors.recordId(record).get
val choExtraction = createChoObject(graph,record,digitalObject)
val webExtraction = createWebResources(graph,record,digitalObject)
webExtraction.obj.foreach(webResource =>
choExtraction.obj.getModel.addAll(webResource.getModel)
)
val esObject = ModelXMLTransformer(model = choExtraction.obj.getModel,
id = recordId,
recordset = "ati-002",
institution = "ati")
//edmRdfModel.serialize(record.toString)
//println(esObject)
val result = ExtractionResult((
record.toString(),
"Test"
recordId,
esObject
)
, new ArrayBuffer[String]())
......@@ -87,12 +86,8 @@ class EDM {
private def createWebResources(graph: JArr, record: JObj, digitalObject: JObj): ExtractionResult[List[WebResource]] = {
//how many digital objects are part of a document??
val webresource: Option[WebResource] = Extractors.dobjectId(digitalObject).map(new WebResource(_))
val listWebresources = if (webresource.isDefined) {
//todo: add additional properties - to be done
println()
} else {List[WebResource]()}
ExtractionResult( List[WebResource]())
val webresource: Option[List[WebResource]] = Extractors.dobjectId(digitalObject).map(id => List[WebResource](new WebResource(id)))
ExtractionResult(webresource.getOrElse(List()) )
}
......@@ -101,40 +96,4 @@ class EDM {
/**
 * Pairs an extracted value with the warnings collected while producing it.
 *
 * @param obj      the extracted value
 * @param warnings mutable buffer of warning messages; defaults to a new empty buffer
 * @tparam T type of the extracted value
 */
case class ExtractionResult[T](obj: T, warnings: mutable.Buffer[String] = mutable.Buffer())
/**
 * Collects the RDF statements describing one resource typed as `EDM.ProvidedCHO`.
 *
 * On construction the statement `<id> rdf:type EDM.ProvidedCHO` is added to a
 * model obtained from `Helper.getModelWithEDMNamespaces`.
 *
 * @param id IRI string identifying the resource; `iri(id)` is evaluated during
 *           construction, so any error it raises surfaces there
 */
class ProvidedCHO (val id: String) {
  import org.eclipse.rdf4j.model.util.Values.iri

  private val model = Helper.getModelWithEDMNamespaces
  private val factory = SimpleValueFactory.getInstance()

  // Built once and shared by every statement instead of re-parsing `id` per call.
  private val subject = iri(id)

  model.add(subject, RDF.TYPE, EDM.ProvidedCHO)

  /** Adds a `DC.DESCRIPTION` literal when `desc` is defined; otherwise does nothing. */
  def addDescription(desc: Option[String]): Unit =
    desc.foreach(text => model.add(subject, DC.DESCRIPTION, factory.createLiteral(text)))

  /**
   * Adds a `DC.TITLE` literal when `title` is defined; otherwise does nothing.
   * The method name (sic, "Titel") is kept unchanged for existing callers.
   */
  def addTitel(title: Option[String]): Unit =
    title.foreach(text => model.add(subject, DC.TITLE, factory.createLiteral(text)))

  /** Adds a `DC.DATE` literal when `createDate` is defined; otherwise does nothing. */
  def addCreationDate(createDate: Option[String]): Unit =
    createDate.foreach(text => model.add(subject, DC.DATE, factory.createLiteral(text)))

  /** Returns the underlying model (the live instance, not a copy). */
  def getModel: Model = model
}
/**
 * Collects the RDF statements describing one resource typed as `EDM.WebResource`.
 *
 * @param id IRI string identifying the resource
 */
class WebResource (val id: String) {

  // The model is created and seeded with the rdf:type statement in one step.
  private val model = {
    val seeded = Helper.getModelWithEDMNamespaces
    seeded.add(iri(id), RDF.TYPE, EDM.WebResource)
    seeded
  }

  /** Returns the underlying model (the live instance, not a copy). */
  def getModel: Model = model
}
/**
 * Empty placeholder class; it declares no members yet.
 * NOTE(review): presumably intended for the EDM aggregation resource - confirm.
 */
class Aggregation
......@@ -20,6 +20,7 @@
package ch.memobase.edm
import ch.memobase.edm.subjects.ProvidedCHO
import org.eclipse.rdf4j.model.Model
import org.eclipse.rdf4j.model.util.ModelBuilder
import org.eclipse.rdf4j.rio.{RDFFormat, Rio}
......@@ -32,7 +33,7 @@ class EDMRdf4jModel {
def serializeModel(providedCho: ProvidedCHO,
lang:RDFFormat = RDFFormat.RDFXML): String = {
lang:RDFFormat = RDFFormat.RDFXML): String = {
val sw = new StringWriter
Rio.write(edmModel, sw,RDFFormat.RDFXML)
......
/*
* rico2edm
* Copyright (C) 2021 UB Basel
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
*/
package ch.memobase.edm.subjects
import ch.memobase.rdf.writer.RdfXmlWriter
import org.eclipse.rdf4j.model.Model
import org.eclipse.rdf4j.rio.Rio
import java.io.StringWriter
import java.time.ZonedDateTime
import java.time.format.DateTimeFormatter
/**
 * Turns an RDF model into the JSON document that is handed on for indexing:
 * the model serialized through `RdfXmlWriter` plus a handful of metadata fields.
 */
object ModelXMLTransformer {

  // Matches an XML declaration such as <?xml version="1.0" encoding="UTF-8"?>
  private val prologPattern = "<\\?xml.*?\\?>".r
  private val dateTimeFormatter = DateTimeFormatter.ISO_INSTANT

  /**
   * Serializes `model` and wraps it, together with the given metadata, in a JSON object.
   *
   * @param model       RDF statements to serialize
   * @param id          value of the `id` field (open question: is this the correct ID?)
   * @param recordset   value of the `recordset` field
   * @param institution value of the `institution` field
   * @param published   value of the `published` field; defaults to `true`
   * @param format      value of the `format` field; defaults to `"EDM"`
   * @return the JSON object rendered as a string; the `document` field holds the
   *         serialized model without its XML declaration and `lastUpdatedDate`
   *         holds the current time formatted with `ISO_INSTANT`
   */
  def apply(
      model: Model,
      id: String,
      recordset: String,
      institution: String,
      published: Boolean = true,
      format: String = "EDM",
  ): String = {
    val sOut = new StringWriter
    Rio.write(model, new RdfXmlWriter(sOut))

    // Strip the XML declaration from the serialized document itself. Doing this on
    // the finished JSON string would match JSON-escaped text and could remove a
    // match from an earlier field (e.g. `id`) instead of the document's prolog.
    val xmlWithoutProlog = prologPattern.replaceFirstIn(sOut.toString, "")

    // TODO: consider creating the OAI header here as well, to relieve the OAI server of that task.
    ujson.Obj(
      "id" -> id,
      "document" -> xmlWithoutProlog,
      "format" -> format,
      // TODO: specific rules are needed to decide which documents get published,
      // or unpublished documents have to be filtered out beforehand.
      "published" -> published,
      // recordset and institution are fixed values supplied by the caller for now.
      "recordset" -> recordset,
      "institution" -> institution,
      "lastUpdatedDate" -> dateTimeFormatter.format(ZonedDateTime.now)
    ).toString
  }
}
/*
* rico2edm
* Copyright (C) 2021 UB Basel
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
*/
package ch.memobase.edm.subjects
import ch.memobase.rdf.Helper
import ch.memobase.rdf.vocabularies.EDM
import org.eclipse.rdf4j.model.Model
import org.eclipse.rdf4j.model.impl.SimpleValueFactory
import org.eclipse.rdf4j.model.util.Values.iri
import org.eclipse.rdf4j.model.vocabulary.{DC, RDF}
/**
 * Collects the RDF statements describing one resource typed as `EDM.ProvidedCHO`.
 *
 * On construction the statement `<id> rdf:type EDM.ProvidedCHO` is added to a
 * model obtained from `Helper.getModelWithEDMNamespaces`.
 *
 * @param id IRI string identifying the resource; `iri(id)` is evaluated during
 *           construction, so any error it raises surfaces there
 */
class ProvidedCHO (val id: String) {
  import org.eclipse.rdf4j.model.util.Values.iri

  private val model = Helper.getModelWithEDMNamespaces
  private val factory = SimpleValueFactory.getInstance()

  // Built once and shared by every statement instead of re-parsing `id` per call.
  private val subject = iri(id)

  model.add(subject, RDF.TYPE, EDM.ProvidedCHO)

  /** Adds a `DC.DESCRIPTION` literal when `desc` is defined; otherwise does nothing. */
  def addDescription(desc: Option[String]): Unit =
    desc.foreach(text => model.add(subject, DC.DESCRIPTION, factory.createLiteral(text)))

  /**
   * Adds a `DC.TITLE` literal when `title` is defined; otherwise does nothing.
   * The method name (sic, "Titel") is kept unchanged for existing callers.
   */
  def addTitel(title: Option[String]): Unit =
    title.foreach(text => model.add(subject, DC.TITLE, factory.createLiteral(text)))

  /** Adds a `DC.DATE` literal when `createDate` is defined; otherwise does nothing. */
  def addCreationDate(createDate: Option[String]): Unit =
    createDate.foreach(text => model.add(subject, DC.DATE, factory.createLiteral(text)))

  /** Returns the underlying model (the live instance, not a copy). */
  def getModel: Model = model
}
/**
 * Collects the RDF statements describing one resource typed as `EDM.WebResource`.
 *
 * @param id IRI string identifying the resource
 */
class WebResource (val id: String) {

  // The model is created and seeded with the rdf:type statement in one step.
  private val model = {
    val seeded = Helper.getModelWithEDMNamespaces
    seeded.add(iri(id), RDF.TYPE, EDM.WebResource)
    seeded
  }

  /** Returns the underlying model (the live instance, not a copy). */
  def getModel: Model = model
}
/**
 * Empty placeholder class; it declares no members yet.
 * NOTE(review): presumably intended for the EDM aggregation resource - confirm.
 */
class Aggregation
{
"@graph": [
{
"@id": "_:b0",
"@type": "https://www.ica.org/standards/RiC/ontology#Identifier",
"identifier": "30466/01-03",
"type": "callNumber"
},
{
"@id": "_:b1",
"@type": "https://www.ica.org/standards/RiC/ontology#Title",
"title": "Montagsstudio",
"type": "broadcast"
},
{
"@id": "_:b10",
"@type": "https://www.ica.org/standards/RiC/ontology#Activity",
"affects": "_:b17",
"beginningDate": "2021-36-22T00:36:47+0000",
"endDate": "2021-36-22T00:36:47+0000",
"performedBy": "_:b9",
"resultsIn": "_:b18",
"type": "enrichment"
},
{
"@id": "_:b11",
"@type": "https://www.ica.org/standards/RiC/ontology#SingleDate",
"normalizedDateValue": "1972-10-10"
},
{
"@id": "_:b12",
"@type": "https://www.ica.org/standards/RiC/ontology#Place",
"name": "BS"
},
{
"@id": "_:b13",
"@type": "https://www.ica.org/standards/RiC/ontology#Activity",
"affects": "_:b15",
"beginningDate": "2021-36-22T00:36:47+0000",
"endDate": "2021-36-22T00:36:47+0000",
"performedBy": "_:b14",
"resultsIn": "_:b16",
"type": "enrichment"
},
{
"@id": "_:b14",
"@type": "https://www.ica.org/standards/RiC/ontology#Mechanism",
"name": "GenreNormalizer",
"performs": "_:b13"
},
{
"@id": "_:b15",
"@type": "http://www.w3.org/2004/02/skos/core#Concept",
"prefLabel": "Hörspiel"
},
{
"@id": "_:b16",
"@type": "http://www.w3.org/2004/02/skos/core#Concept",
"prefLabel": [
{
"@language": "de",
"@value": "Hörspiel"
},
{
"@language": "fr",
"@value": "Pièce radiophonique"
},
{
"@language": "it",
"@value": "radio gioco"
}
],
"resultsFrom": "_:b13"
},
{
"@id": "_:b17",
"@type": "https://www.ica.org/standards/RiC/ontology#CarrierType",
"name": "1/4 Zoll Magnetband"
},
{
"@id": "_:b18",
"@type": "https://www.ica.org/standards/RiC/ontology#CarrierType",
"sameAs": "http://www.wikidata.org/entity/Q61996834",
"name": [
{
"@language": "de",
"@value": "FEHLENDES LABEL"
},
{
"@language": "fr",
"@value": "L'ÉTIQUETTE MANQUANTE"
},
{
"@language": "it",
"@value": "GALATEO MANCANTE"
}
],
"resultsFrom": "_:b10"
},
{
"@id": "_:b19",
"@type": "https://www.ica.org/standards/RiC/ontology#Rule",
"name": "via SRG SSR klären",
"regulates": "https://memobase.ch/record/srf-016-BS_MG_30466_K01",
"type": "holder"
},
{
"@id": "_:b2",
"@type": "https://www.ica.org/standards/RiC/ontology#Mechanism",
"name": "LanguagesNormalizer",
"performs": "_:b3"
},
{
"@id": "_:b20",
"@type": "https://www.ica.org/standards/RiC/ontology#Identifier",
"identifier": "srf-016-BS_MG_30466_K01-1",
"type": "main"
},
{
"@id": "_:b21",
"@type": "https://www.ica.org/standards/RiC/ontology#Identifier",
"identifier": "srf-016-BS_MG_30466_K01",
"type": "main"
},
{
"@id": "_:b22",
"@type": "https://www.ica.org/standards/RiC/ontology#Language",
"sameAs": "http://www.wikidata.org/entity/Q387066",
"name": [
{
"@language": "de",
"@value": "Schweizerdeutsch"
},
{
"@language": "fr",
"@value": "suisse allemand"
},
{
"@language": "it",
"@value": "svizzero tedesco"
}
],
"resultsFrom": "_:b23",
"type": "content"
},
{
"@id": "_:b23",
"@type": "https://www.ica.org/standards/RiC/ontology#Activity",
"affects": "_:b5",
"beginningDate": "2021-36-22T00:36:47+0000",
"endDate": "2021-36-22T00:36:47+0000",
"performedBy": "_:b25",
"resultsIn": "_:b22",
"type": "enrichment"
},
{
"@id": "_:b24",
"@type": "https://www.ica.org/standards/RiC/ontology#Rule",
"name": "onsite",
"regulates": "https://memobase.ch/physical/srf-016-BS_MG_30466_K01-1",
"type": "access"
},
{
"@id": "_:b25",
"@type": "https://www.ica.org/standards/RiC/ontology#Mechanism",
"name": "LanguagesNormalizer",
"performs": "_:b23"
},
{
"@id": "_:b26",
"@type": "https://www.ica.org/standards/RiC/ontology#SingleDate",
"normalizedDateValue": "1972-11-06"
},
{
"@id": "_:b27",
"@type": "https://www.ica.org/standards/RiC/ontology#Identifier",
"identifier": "BS_MG_30466_K01",
"type": "original"
},
{
"@id": "_:b28",
"@type": "https://www.ica.org/standards/RiC/ontology#Title",
"title": "\"T' Innkwisizioon oder Die Inquisition?\" Hörspiel und Geduchte je in hochdeutscher und in Dialektfassung. Diskussion über das Thema Dialekt oder hochdeutsche Fassung - Teil 1: Hörspiel Dialektfassung",
"type": "main"
},
{
"@id": "_:b3",
"@type": "https://www.ica.org/standards/RiC/ontology#Activity",
"affects": "_:b5",
"beginningDate": "2021-36-22T00:36:47+0000",
"endDate": "2021-36-22T00:36:47+0000",
"performedBy": "_:b2",
"resultsIn": "_:b6",
"type": "enrichment"
},
{
"@id": "_:b4",
"@type": "https://www.ica.org/standards/RiC/ontology#Identifier",
"identifier": "SRF-BS_MG_30466_K01",
"type": "oldMemobase"
},
{
"@id": "_:b5",
"@type": "https://www.ica.org/standards/RiC/ontology#Language",
"name": "schw",
"type": "content"
},
{
"@id": "_:b6",
"@type": "https://www.ica.org/standards/RiC/ontology#Language",
"name": [
{
"@language": "de",
"@value": "FEHLENDES LABEL"
},
{
"@language": "fr",
"@value": "L'ÉTIQUETTE MANQUANTE"
},
{
"@language": "it",
"@value": "GALATEO MANCANTE"
}
],
"resultsFrom": "_:b3",
"type": "content"
},