Due to a scheduled upgrade to version 14.10, GitLab will be unavailable on Monday 30.05., from 19:00 until 20:00.

Commit 7c83c4c8 authored by Günter Hipler's avatar Günter Hipler
Browse files

implemented ISO language codes

parent cf47141c
Pipeline #24346 passed with stages
in 7 minutes and 16 seconds
s,o
http://www.wikidata.org/entity/Q27683,ace http://www.wikidata.org/entity/Q27683,ace
http://www.wikidata.org/entity/Q27776,ady http://www.wikidata.org/entity/Q27776,ady
http://www.wikidata.org/entity/Q42365,ang http://www.wikidata.org/entity/Q42365,ang
......
...@@ -16,7 +16,7 @@ Export des topic ...@@ -16,7 +16,7 @@ Export des topic
kafkacat -C -b mb-ka1:9092 -t fedora-output-json-records -K '\t' -o beginning | gzip > fedora-output1.json.gz kafkacat -C -b mb-ka1:9092 -t fedora-output-json-records -K '\t' -o beginning | gzip > fedora-output1.json.gz
Import des topic Import des topic
docker run --rm -v /home/swissbib/environment/code/repositories/memoriav/gitlab/services/postprocessing/rico-edm-transformer/data:/data -it --network host edenhill/kafkacat:1.6.0 -P -b VPN:9092 -t fedora-output-json-records -K '\t' -l /data/fedora-output.json docker run --rm -v /home/swissbib/environment/code/repositories/memoriav/gitlab/services/postprocessing/rico-edm-transformer/data:/data -it --network host edenhill/kafkacat:1.6.0 -P -b VPN:9092 -t fedora-output-json-records -K '\t' -l /data/fedora-output1.json
......
app: app:
institutionTypeLabelsPath: "/home/swissbib/environment/code/repositories/memoriav/gitlab/services/postprocessing/rico-edm-transformer/configs/institution_types/labels.csv" #institutionTypeLabelsPath: "/home/swissbib/environment/code/repositories/memoriav/gitlab/services/postprocessing/rico-edm-transformer/configs/institution_types/labels.csv"
documentTypeLabelsPath: "/home/swissbib/environment/code/repositories/memoriav/gitlab/services/postprocessing/rico-edm-transformer/configs/document_types/labels.csv" #documentTypeLabelsPath: "/home/swissbib/environment/code/repositories/memoriav/gitlab/services/postprocessing/rico-edm-transformer/configs/document_types/labels.csv"
accessTermLabelsPath: "/home/swissbib/environment/code/repositories/memoriav/gitlab/services/postprocessing/rico-edm-transformer/configs/access_terms/labels.csv" #accessTermLabelsPath: "/home/swissbib/environment/code/repositories/memoriav/gitlab/services/postprocessing/rico-edm-transformer/configs/access_terms/labels.csv"
reuseStatementLabelsPath: "/home/swissbib/environment/code/repositories/memoriav/gitlab/services/postprocessing/rico-edm-transformer/configs/reuse_statements/labels.csv" #reuseStatementLabelsPath: "/home/swissbib/environment/code/repositories/memoriav/gitlab/services/postprocessing/rico-edm-transformer/configs/reuse_statements/labels.csv"
isocodemapping: "/home/swissbib/environment/code/repositories/memoriav/gitlab/services/postprocessing/rico-edm-transformer/configs/isocode-693-mapping/labels.csv" isocodemapping: ${ISOCODE_MAPPING:?system}
......
...@@ -20,13 +20,13 @@ ...@@ -20,13 +20,13 @@
package ch.memobase.rico2edm package ch.memobase.rico2edm
import ch.memobase.rico2edm.utils.Helper
import org.apache.kafka.streams.KafkaStreams import org.apache.kafka.streams.KafkaStreams
import org.apache.logging.log4j.scala.Logging import org.apache.logging.log4j.scala.Logging
import org.memobase.settings.SettingsLoader import org.memobase.settings.SettingsLoader
import java.time.Duration import java.time.Duration
import scala.util.{Failure, Success, Try} import scala.util.{Failure, Success, Try}
import scala.jdk.CollectionConverters._ import scala.jdk.CollectionConverters._
...@@ -60,6 +60,8 @@ object Main extends Logging { ...@@ -60,6 +60,8 @@ object Main extends Logging {
) )
val shutdownGracePeriodMs = 10000 val shutdownGracePeriodMs = 10000
Helper.initEnrichementMapping(settings.getAppSettings)
logger.trace("Starting stream processing") logger.trace("Starting stream processing")
Try( Try(
streams.start() streams.start()
......
...@@ -23,6 +23,7 @@ package ch.memobase.rico2edm.edm ...@@ -23,6 +23,7 @@ package ch.memobase.rico2edm.edm
import ch.memobase.rico2edm.edm import ch.memobase.rico2edm.edm
import ch.memobase.rico2edm.edm.subjects.{Aggregation, Concept, ModelXMLTransformer, Place, ProvidedCHO, TimeSpan, WebResource} import ch.memobase.rico2edm.edm.subjects.{Aggregation, Concept, ModelXMLTransformer, Place, ProvidedCHO, TimeSpan, WebResource}
import ch.memobase.rico2edm.utils.Helper
import java.time.format.DateTimeFormatter import java.time.format.DateTimeFormatter
import scala.collection.mutable import scala.collection.mutable
...@@ -129,7 +130,7 @@ class EDM { ...@@ -129,7 +130,7 @@ class EDM {
Extractors Extractors
.resourceAllLanguages(graph)(record.value) .resourceAllLanguages(graph)(record.value)
.foreach(c => cho.addLanguage(Some(c))) .foreach(c => cho.addLanguage(Helper.getLanguageCode(c)))
Extractors Extractors
.publishedByGH(graph)(record.value) .publishedByGH(graph)(record.value)
......
...@@ -59,6 +59,7 @@ object ModelXMLTransformer { ...@@ -59,6 +59,7 @@ object ModelXMLTransformer {
//is this the correct ID //is this the correct ID
"id" -> id, "id" -> id,
"document" -> Base64.getEncoder.encodeToString(Helper.compress(sOut.toString.getBytes)), "document" -> Base64.getEncoder.encodeToString(Helper.compress(sOut.toString.getBytes)),
//"document" -> sOut.toString,
"format" -> format, "format" -> format,
//we need specific rules to decide which documents are going to be published //we need specific rules to decide which documents are going to be published
//or we have to filter them out //or we have to filter them out
......
...@@ -23,9 +23,13 @@ package ch.memobase.rico2edm.utils ...@@ -23,9 +23,13 @@ package ch.memobase.rico2edm.utils
import java.io.ByteArrayOutputStream import java.io.ByteArrayOutputStream
import java.util.zip.Deflater import java.util.zip.Deflater
import java.util.{Properties, HashMap => JHashMap}
import scala.language.reflectiveCalls
object Helper { object Helper {
private var isoLanguageCodes: Option[JHashMap[String,String]] = None
def compress(data: Array[Byte]): Array[Byte] = { def compress(data: Array[Byte]): Array[Byte] = {
val deflater = new Deflater() val deflater = new Deflater()
deflater.setInput(data) deflater.setInput(data)
...@@ -46,6 +50,36 @@ object Helper { ...@@ -46,6 +50,36 @@ object Helper {
} }
/**
 * Loads the language code mapping from a CSV file and installs it as the
 * lookup table consulted by `getLanguageCode`.
 *
 * The file path is read from `props` under `Keys.LANGUAGE_ISO_CODE`. Every
 * line is split on commas; the first (trimmed) field becomes the key and the
 * second (trimmed) field the value. Lines with fewer than two fields, such as
 * blank lines, are skipped instead of aborting the whole load.
 *
 * @param props settings that contain the path of the mapping file
 * @throws IllegalArgumentException if the path setting is not present
 */
def initEnrichementMapping(props: Properties): Unit = {
  // Fail with a descriptive message instead of a bare NullPointerException.
  val mappingPath = Option(props.get(Keys.LANGUAGE_ISO_CODE))
    .map(_.toString)
    .getOrElse(
      throw new IllegalArgumentException(
        s"Missing required setting '${Keys.LANGUAGE_ISO_CODE}'"
      )
    )

  val isoCodes = new JHashMap[String, String]()
  using(io.Source.fromFile(mappingPath)) { source =>
    source.getLines().foreach { line =>
      line.split(",").map(_.trim) match {
        case Array(key, code, _*) => isoCodes.put(key, code)
        case _                    => () // blank or incomplete line: nothing to map
      }
    }
  }
  // Publish the table only after the whole file was read successfully.
  isoLanguageCodes = Some(isoCodes)
}
//noinspection ScalaStyle
/**
 * Runs `f` on `resource` and closes the resource afterwards, whether `f`
 * returns normally or throws.
 *
 * The bound is `AutoCloseable` rather than a structural type so that
 * `close()` is an ordinary method call and not a reflective one.
 *
 * @param resource the resource to hand to `f` and to close afterwards
 * @param f        the computation to run with the open resource
 * @return whatever `f` returns
 */
private def using[A <: AutoCloseable, B](resource: A)(f: A => B): B =
  try f(resource)
  finally resource.close()
/**
 * Looks up the language code stored for the given identifier.
 *
 * @param wikiId key to look up in the loaded mapping
 * @return the mapped code, or `None` when the mapping has not been loaded
 *         yet or holds no entry for `wikiId`
 */
def getLanguageCode(wikiId: String): Option[String] =
  // Single lookup; Option(...) turns the Java map's null-for-missing into None.
  isoLanguageCodes.flatMap(codes => Option(codes.get(wikiId)))
} }
/*
* rico2edm
* Copyright (C) 2021 UB Basel
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
*/
package ch.memobase.rico2edm.utils
/** Names of the settings keys used by this application. */
object Keys {

  /** Settings key under which the path of the language ISO code mapping file is stored. */
  val LANGUAGE_ISO_CODE: String = "isocodemapping"
}
http://www.wikidata.org/entity/Q27683,ace
http://www.wikidata.org/entity/Q27776,ady
http://www.wikidata.org/entity/Q42365,ang
/*
* rico2edm
* Copyright (C) 2021 UB Basel
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
*/
package ch.memobase.rico2edm.edm
import ch.memobase.rico2edm.utils.{Helper, Keys}
import org.scalatest.funsuite.AnyFunSuite
import org.scalatest.matchers.should.Matchers
import java.util.Properties
/**
 * Tests for the language code lookup in `Helper`.
 *
 * `Helper` keeps the loaded mapping in object-level state, so these tests are
 * order dependent: the first one only holds as long as nothing has loaded the
 * mapping before it runs.
 */
class HelperSpec extends AnyFunSuite with Matchers {

  private val knownId = "http://www.wikidata.org/entity/Q27683"

  //has to run as first test
  test ("no initialization of mapping") {
    Helper.getLanguageCode(knownId) shouldBe None
  }

  test ("load language iso codes") {
    val props = new Properties()
    props.put(Keys.LANGUAGE_ISO_CODE, "src/test/resources/enrichement/few-language-codes.csv")
    Helper.initEnrichementMapping(props)

    // Compare against Some(...) so a missing entry fails as an assertion, not as a .get exception.
    Helper.getLanguageCode(knownId) shouldBe Some("ace")
    //id is not available
    Helper.getLanguageCode("http://www.wikidata.org/entity/Q2768") shouldBe None
  }
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment