Commit 7c83c4c8 authored by Günter Hipler's avatar Günter Hipler
Browse files

implemented ISO language codes

parent cf47141c
Pipeline #24346 passed with stages
in 7 minutes and 16 seconds
s,o
http://www.wikidata.org/entity/Q27683,ace
http://www.wikidata.org/entity/Q27776,ady
http://www.wikidata.org/entity/Q42365,ang
......
......@@ -16,7 +16,7 @@ Export des topic
kafkacat -C -b mb-ka1:9092 -t fedora-output-json-records -K '\t' -o beginning | gzip > fedora-output1.json.gz
Import des topic
docker run --rm -v /home/swissbib/environment/code/repositories/memoriav/gitlab/services/postprocessing/rico-edm-transformer/data:/data -it --network host edenhill/kafkacat:1.6.0 -P -b VPN:9092 -t fedora-output-json-records -K '\t' -l /data/fedora-output.json
docker run --rm -v /home/swissbib/environment/code/repositories/memoriav/gitlab/services/postprocessing/rico-edm-transformer/data:/data -it --network host edenhill/kafkacat:1.6.0 -P -b VPN:9092 -t fedora-output-json-records -K '\t' -l /data/fedora-output1.json
......
app:
institutionTypeLabelsPath: "/home/swissbib/environment/code/repositories/memoriav/gitlab/services/postprocessing/rico-edm-transformer/configs/institution_types/labels.csv"
documentTypeLabelsPath: "/home/swissbib/environment/code/repositories/memoriav/gitlab/services/postprocessing/rico-edm-transformer/configs/document_types/labels.csv"
accessTermLabelsPath: "/home/swissbib/environment/code/repositories/memoriav/gitlab/services/postprocessing/rico-edm-transformer/configs/access_terms/labels.csv"
reuseStatementLabelsPath: "/home/swissbib/environment/code/repositories/memoriav/gitlab/services/postprocessing/rico-edm-transformer/configs/reuse_statements/labels.csv"
isocodemapping: "/home/swissbib/environment/code/repositories/memoriav/gitlab/services/postprocessing/rico-edm-transformer/configs/isocode-693-mapping/labels.csv"
#institutionTypeLabelsPath: "/home/swissbib/environment/code/repositories/memoriav/gitlab/services/postprocessing/rico-edm-transformer/configs/institution_types/labels.csv"
#documentTypeLabelsPath: "/home/swissbib/environment/code/repositories/memoriav/gitlab/services/postprocessing/rico-edm-transformer/configs/document_types/labels.csv"
#accessTermLabelsPath: "/home/swissbib/environment/code/repositories/memoriav/gitlab/services/postprocessing/rico-edm-transformer/configs/access_terms/labels.csv"
#reuseStatementLabelsPath: "/home/swissbib/environment/code/repositories/memoriav/gitlab/services/postprocessing/rico-edm-transformer/configs/reuse_statements/labels.csv"
isocodemapping: ${ISOCODE_MAPPING:?system}
......
......@@ -20,13 +20,13 @@
package ch.memobase.rico2edm
import ch.memobase.rico2edm.utils.Helper
import org.apache.kafka.streams.KafkaStreams
import org.apache.logging.log4j.scala.Logging
import org.memobase.settings.SettingsLoader
import java.time.Duration
import scala.util.{Failure, Success, Try}
import scala.jdk.CollectionConverters._
......@@ -60,6 +60,8 @@ object Main extends Logging {
)
val shutdownGracePeriodMs = 10000
Helper.initEnrichementMapping(settings.getAppSettings)
logger.trace("Starting stream processing")
Try(
streams.start()
......
......@@ -23,6 +23,7 @@ package ch.memobase.rico2edm.edm
import ch.memobase.rico2edm.edm
import ch.memobase.rico2edm.edm.subjects.{Aggregation, Concept, ModelXMLTransformer, Place, ProvidedCHO, TimeSpan, WebResource}
import ch.memobase.rico2edm.utils.Helper
import java.time.format.DateTimeFormatter
import scala.collection.mutable
......@@ -129,7 +130,7 @@ class EDM {
Extractors
.resourceAllLanguages(graph)(record.value)
.foreach(c => cho.addLanguage(Some(c)))
.foreach(c => cho.addLanguage(Helper.getLanguageCode(c)))
Extractors
.publishedByGH(graph)(record.value)
......
......@@ -59,6 +59,7 @@ object ModelXMLTransformer {
//is this the correct ID
"id" -> id,
"document" -> Base64.getEncoder.encodeToString(Helper.compress(sOut.toString.getBytes)),
//"document" -> sOut.toString,
"format" -> format,
//we need specific rules to decide which documents are going to be published
//or we have to filter them out
......
......@@ -23,9 +23,13 @@ package ch.memobase.rico2edm.utils
import java.io.ByteArrayOutputStream
import java.util.zip.Deflater
import java.util.{Properties, HashMap => JHashMap}
import scala.language.reflectiveCalls
object Helper {
private var isoLanguageCodes: Option[JHashMap[String,String]] = None
def compress(data: Array[Byte]): Array[Byte] = {
val deflater = new Deflater()
deflater.setInput(data)
......@@ -46,6 +50,36 @@ object Helper {
}
/**
 * Loads the Wikidata-entity-to-ISO-language-code mapping from the CSV file
 * whose path is stored under [[Keys.LANGUAGE_ISO_CODE]] in `props` and makes
 * it available to [[getLanguageCode]].
 *
 * Each line is expected to look like `<wikidata entity URI>,<iso code>`;
 * both fields are trimmed. Lines that do not contain at least two
 * comma-separated fields (for example blank lines) are skipped instead of
 * aborting the whole load.
 *
 * @param props application settings containing the path of the mapping file
 * @throws IllegalArgumentException if the mapping path is not configured
 */
def initEnrichementMapping(props: Properties): Unit = {
  // Properties.get returns null for an unknown key; fail with a message that
  // names the missing setting instead of a bare NullPointerException.
  val mappingPath = Option(props.get(Keys.LANGUAGE_ISO_CODE))
    .map(_.toString)
    .getOrElse(
      throw new IllegalArgumentException(
        s"missing required property '${Keys.LANGUAGE_ISO_CODE}'"
      )
    )

  val isoCodes = new JHashMap[String, String]()
  using(io.Source.fromFile(mappingPath)) { source =>
    source.getLines().foreach { line =>
      line.split(",").map(_.trim) match {
        case Array(wikiId, isoCode, _*) =>
          isoCodes.put(wikiId, isoCode)
          ()
        // Fewer than two fields: nothing to map, ignore the line.
        case _ => ()
      }
    }
  }
  // Publish only after the file was read completely, so a failing load
  // leaves the previously active mapping untouched.
  isoLanguageCodes = Some(isoCodes)
}
/**
 * Runs `f` on `resource` and closes the resource afterwards, whether `f`
 * returns normally or throws.
 *
 * The resource type is structural: anything with a `close(): Unit` method
 * is accepted.
 *
 * @param resource the closeable value handed to `f`
 * @param f        the computation to run with the resource
 * @return whatever `f` returns
 */
//noinspection ScalaStyle
private def using[A <: { def close(): Unit }, B](resource: A)(f: A => B): B = {
  try f(resource)
  finally resource.close()
}
/**
 * Looks up the language code registered for a Wikidata entity URI.
 *
 * @param wikiId the entity identifier used as key in the loaded mapping
 * @return the mapped code, or `None` when no mapping has been loaded yet or
 *         the identifier is not contained in it
 */
def getLanguageCode(wikiId: String): Option[String] =
  isoLanguageCodes
    .filter(_.containsKey(wikiId))
    .map(_.get(wikiId))
}
/*
* rico2edm
* Copyright (C) 2021 UB Basel
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
*/
package ch.memobase.rico2edm.utils
/** Names of the application settings read by this module. */
object Keys {

  /** Setting whose value is the path of the language-code mapping file. */
  val LANGUAGE_ISO_CODE: String = "isocodemapping"
}
http://www.wikidata.org/entity/Q27683,ace
http://www.wikidata.org/entity/Q27776,ady
http://www.wikidata.org/entity/Q42365,ang
/*
* rico2edm
* Copyright (C) 2021 UB Basel
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
*/
package ch.memobase.rico2edm.edm
import ch.memobase.rico2edm.utils.{Helper, Keys}
import org.scalatest.funsuite.AnyFunSuite
import org.scalatest.matchers.should.Matchers
import java.util.Properties
/**
 * Tests for the language-code lookup offered by `Helper`.
 *
 * `Helper` keeps the loaded mapping in object-level state, so the tests in
 * this suite depend on their registration order.
 */
class HelperSpec extends AnyFunSuite with Matchers {

  /** Entity URI that is present in the test mapping file. */
  private val knownWikiId = "http://www.wikidata.org/entity/Q27683"

  // Has to run as first test: once the mapping has been initialised by a
  // later test, this lookup would succeed.
  test ("no initialization of mapping") {
    Helper.getLanguageCode(knownWikiId) shouldBe None
  }

  test ("load language iso codes") {
    val props = new Properties()
    props.put(Keys.LANGUAGE_ISO_CODE, "src/test/resources/enrichement/few-language-codes.csv")
    Helper.initEnrichementMapping(props)

    // Compare the whole Option instead of calling .get, so a missing entry
    // is reported as a readable assertion failure.
    Helper.getLanguageCode(knownWikiId) shouldBe Some("ace")
    // id is not available
    Helper.getLanguageCode("http://www.wikidata.org/entity/Q2768") shouldBe None
  }
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment