Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
memoriav
Memobase 2020
services
postprocessing
rico-edm-transformer
Commits
7c83c4c8
Commit
7c83c4c8
authored
Apr 08, 2021
by
Günter Hipler
Browse files
implemented ISO language codes
parent
cf47141c
Pipeline
#24346
passed with stages
in 7 minutes and 16 seconds
Changes
10
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
configs/isocode-693-mapping/labels.csv
View file @
7c83c4c8
s,o
http://www.wikidata.org/entity/Q27683,ace
http://www.wikidata.org/entity/Q27776,ady
http://www.wikidata.org/entity/Q42365,ang
...
...
gh/kafkacatnotes.txt
View file @
7c83c4c8
...
...
@@ -16,7 +16,7 @@ Export des topic
kafkacat -C -b mb-ka1:9092 -t fedora-output-json-records -K '\t' -o beginning | gzip > fedora-output1.json.gz
Import des topic
docker run --rm -v /home/swissbib/environment/code/repositories/memoriav/gitlab/services/postprocessing/rico-edm-transformer/data:/data -it --network host edenhill/kafkacat:1.6.0 -P -b VPN:9092 -t fedora-output-json-records -K '\t' -l /data/fedora-output.json
docker run --rm -v /home/swissbib/environment/code/repositories/memoriav/gitlab/services/postprocessing/rico-edm-transformer/data:/data -it --network host edenhill/kafkacat:1.6.0 -P -b VPN:9092 -t fedora-output-json-records -K '\t' -l /data/fedora-output1.json
...
...
src/main/resources/app.yml
View file @
7c83c4c8
app
:
institutionTypeLabelsPath
:
"
/home/swissbib/environment/code/repositories/memoriav/gitlab/services/postprocessing/rico-edm-transformer/configs/institution_types/labels.csv"
documentTypeLabelsPath
:
"
/home/swissbib/environment/code/repositories/memoriav/gitlab/services/postprocessing/rico-edm-transformer/configs/document_types/labels.csv"
accessTermLabelsPath
:
"
/home/swissbib/environment/code/repositories/memoriav/gitlab/services/postprocessing/rico-edm-transformer/configs/access_terms/labels.csv"
reuseStatementLabelsPath
:
"
/home/swissbib/environment/code/repositories/memoriav/gitlab/services/postprocessing/rico-edm-transformer/configs/reuse_statements/labels.csv"
isocodemapping
:
"
/home/swissbib/environment/code/repositories/memoriav/gitlab/services/postprocessing/rico-edm-transformer/configs/isocode-693-mapping/labels.csv"
#
institutionTypeLabelsPath: "/home/swissbib/environment/code/repositories/memoriav/gitlab/services/postprocessing/rico-edm-transformer/configs/institution_types/labels.csv"
#
documentTypeLabelsPath: "/home/swissbib/environment/code/repositories/memoriav/gitlab/services/postprocessing/rico-edm-transformer/configs/document_types/labels.csv"
#
accessTermLabelsPath: "/home/swissbib/environment/code/repositories/memoriav/gitlab/services/postprocessing/rico-edm-transformer/configs/access_terms/labels.csv"
#
reuseStatementLabelsPath: "/home/swissbib/environment/code/repositories/memoriav/gitlab/services/postprocessing/rico-edm-transformer/configs/reuse_statements/labels.csv"
isocodemapping
:
${ISOCODE_MAPPING:?system}
...
...
src/main/scala/ch/memobase/rico2edm/Main.scala
View file @
7c83c4c8
...
...
@@ -20,13 +20,13 @@
package
ch.memobase.rico2edm
import
ch.memobase.rico2edm.utils.Helper
import
org.apache.kafka.streams.KafkaStreams
import
org.apache.logging.log4j.scala.Logging
import
org.memobase.settings.SettingsLoader
import
java.time.Duration
import
scala.util.
{
Failure
,
Success
,
Try
}
import
scala.jdk.CollectionConverters._
...
...
@@ -60,6 +60,8 @@ object Main extends Logging {
)
val
shutdownGracePeriodMs
=
10000
Helper
.
initEnrichementMapping
(
settings
.
getAppSettings
)
logger
.
trace
(
"Starting stream processing"
)
Try
(
streams
.
start
()
...
...
src/main/scala/ch/memobase/rico2edm/edm/EDM.scala
View file @
7c83c4c8
...
...
@@ -23,6 +23,7 @@ package ch.memobase.rico2edm.edm
import
ch.memobase.rico2edm.edm
import
ch.memobase.rico2edm.edm.subjects.
{
Aggregation
,
Concept
,
ModelXMLTransformer
,
Place
,
ProvidedCHO
,
TimeSpan
,
WebResource
}
import
ch.memobase.rico2edm.utils.Helper
import
java.time.format.DateTimeFormatter
import
scala.collection.mutable
...
...
@@ -129,7 +130,7 @@ class EDM {
Extractors
.
resourceAllLanguages
(
graph
)(
record
.
value
)
.
foreach
(
c
=>
cho
.
addLanguage
(
Som
e
(
c
)))
.
foreach
(
c
=>
cho
.
addLanguage
(
Helper
.
getLanguageCod
e
(
c
)))
Extractors
.
publishedByGH
(
graph
)(
record
.
value
)
...
...
src/main/scala/ch/memobase/rico2edm/edm/subjects/ModelXMLTransformer.scala
View file @
7c83c4c8
...
...
@@ -59,6 +59,7 @@ object ModelXMLTransformer {
//is this the correct ID
"id"
->
id
,
"document"
->
Base64
.
getEncoder
.
encodeToString
(
Helper
.
compress
(
sOut
.
toString
.
getBytes
)),
//"document" -> sOut.toString,
"format"
->
format
,
//we need specific rules to decide which documents are going to be published
//or we have to filter them out
...
...
src/main/scala/ch/memobase/rico2edm/utils/Helper.scala
View file @
7c83c4c8
...
...
@@ -23,9 +23,13 @@ package ch.memobase.rico2edm.utils
import
java.io.ByteArrayOutputStream
import
java.util.zip.Deflater
import
java.util.
{
Properties
,
HashMap
=>
JHashMap
}
import
scala.language.reflectiveCalls
object
Helper
{
private
var
isoLanguageCodes
:
Option
[
JHashMap
[
String
,
String
]]
=
None
def
compress
(
data
:
Array
[
Byte
])
:
Array
[
Byte
]
=
{
val
deflater
=
new
Deflater
()
deflater
.
setInput
(
data
)
...
...
@@ -46,6 +50,36 @@ object Helper {
}
/**
 * Loads the language-code mapping from the CSV file configured under
 * [[Keys.LANGUAGE_ISO_CODE]] and stores it for later lookups via `getLanguageCode`.
 *
 * Each line is split on `,`; the first (trimmed) field becomes the key and the
 * second (trimmed) field the value. Lines with fewer than two fields, such as
 * blank lines, are skipped instead of aborting the whole load.
 *
 * @param props application settings; must contain the path of the mapping file
 *              under the key [[Keys.LANGUAGE_ISO_CODE]]
 * @throws IllegalArgumentException if the settings key is missing
 */
def initEnrichementMapping(props: Properties): Unit = {
  // java.util.Properties#get returns null for an unknown key; fail with a clear message instead of an NPE.
  val mappingPath = Option(props.get(Keys.LANGUAGE_ISO_CODE))
    .map(_.toString)
    .getOrElse(
      throw new IllegalArgumentException(
        s"Missing required setting '${Keys.LANGUAGE_ISO_CODE}' (path of the language code mapping file)"
      )
    )

  val isoCodes = new JHashMap[String, String]()
  using(io.Source.fromFile(mappingPath)) { source =>
    for (line <- source.getLines) {
      line.split(",").map(_.trim) match {
        // Any fields beyond the second are ignored, as before.
        case Array(wikiId, isoCode, _*) => isoCodes.put(wikiId, isoCode)
        // Fewer than two fields: nothing to map, skip the line.
        case _ =>
      }
    }
  }
  // Publish the mapping only after the file has been read completely.
  isoLanguageCodes = Some(isoCodes)
}
/**
 * Applies `f` to `resource` and closes the resource afterwards, whether `f`
 * returns normally or throws.
 *
 * The bound is `AutoCloseable` rather than a structural type, so `close()` is
 * an ordinary interface call and needs no reflection.
 *
 * @param resource the resource to hand to `f` and to close when `f` is done
 * @param f        the computation to run against the open resource
 * @tparam A the resource type
 * @tparam B the result type of `f`
 * @return whatever `f` returns
 */
//noinspection ScalaStyle
private def using[A <: AutoCloseable, B](resource: A)(f: A => B): B =
  try {
    f(resource)
  } finally {
    resource.close()
  }
/**
 * Looks up the language code stored for the given identifier.
 *
 * @param wikiId key to look up, e.g. `http://www.wikidata.org/entity/Q27683`
 * @return the mapped code, or `None` when the mapping has not been loaded yet
 *         or holds no entry for `wikiId`
 */
def getLanguageCode(wikiId: String): Option[String] = {
  // A single lookup is enough: java.util.HashMap#get returns null for an unknown key,
  // and Option(...) turns that null into None.
  isoLanguageCodes.flatMap(langKeys => Option(langKeys.get(wikiId)))
}
}
src/main/scala/ch/memobase/rico2edm/utils/Keys.scala
0 → 100644
View file @
7c83c4c8
/*
* rico2edm
* Copyright (C) 2021 UB Basel
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
*/
package
ch.memobase.rico2edm.utils
/** Property keys used to look up application settings. */
object Keys {

  /** Key under which the path of the language code mapping file is configured. */
  val LANGUAGE_ISO_CODE: String = "isocodemapping"
}
src/test/resources/enrichement/few-language-codes.csv
0 → 100644
View file @
7c83c4c8
http://www.wikidata.org/entity/Q27683,ace
http://www.wikidata.org/entity/Q27776,ady
http://www.wikidata.org/entity/Q42365,ang
src/test/scala/ch/memobase/rico2edm/edm/HelperSpec.scala
0 → 100644
View file @
7c83c4c8
/*
* rico2edm
* Copyright (C) 2021 UB Basel
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
*/
package
ch.memobase.rico2edm.edm
import
ch.memobase.rico2edm.utils.
{
Helper
,
Keys
}
import
org.scalatest.funsuite.AnyFunSuite
import
org.scalatest.matchers.should.Matchers
import
java.util.Properties
/**
 * Tests for the language-code lookup in [[Helper]].
 *
 * `Helper` keeps the loaded mapping in object-level state, so the tests are
 * order dependent: the "no initialization" case must run before any test
 * that calls `initEnrichementMapping`.
 */
class HelperSpec extends AnyFunSuite with Matchers {

  private val knownId = "http://www.wikidata.org/entity/Q27683"

  //has to run as first test
  test("no initialization of mapping") {
    Helper.getLanguageCode(knownId) shouldBe None
  }

  test("load language iso codes") {
    val props = new Properties()
    props.put(Keys.LANGUAGE_ISO_CODE, "src/test/resources/enrichement/few-language-codes.csv")
    Helper.initEnrichementMapping(props)

    // Compare the whole Option so a missing entry is reported as an assertion
    // failure instead of a NoSuchElementException from `.get`.
    Helper.getLanguageCode(knownId) shouldBe Some("ace")
    //id is not available
    Helper.getLanguageCode("http://www.wikidata.org/entity/Q2768") shouldBe None
  }
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment