Commit f39ea755 authored by Jonas Waeber
Browse files

Fix use of csv files for labels.

parent 850b7375
Pipeline #21608 passed with stages
in 4 minutes
......@@ -53,6 +53,8 @@ dependencies {
// JSON Parser
implementation 'com.beust:klaxon:5.2'
// CSV Reader
implementation("com.github.doyaaaaaken:kotlin-csv-jvm:0.7.3")
implementation 'org.jetbrains.kotlin:kotlin-stdlib-jdk8'
implementation "org.jetbrains.kotlin:kotlin-script-runtime:1.3.71"
......
......@@ -5,7 +5,9 @@ metadata:
namespace: memobase
data:
APPLICATION_ID: "{{ .Values.deploymentName }}-app"
PATH_LANGUAGE_SOURCE: "{{ .Values.pathLanguageSource }}"
PATH_LANGUAGE_LABELS: "{{ .Values.pathLanguageSource }}"
PATH_CANTON_LABELS: "{{ .Values.pathCantonLabels }}"
PATH_MUNICIPALITY_LABELS: "{{ .Values.pathMunicipalityLabels }}"
TOPIC_IN: "{{ .Values.inputTopic }}"
TOPIC_OUT: "{{ .Values.outputTopic }}"
TOPIC_PROCESS: "{{ .Values.deploymentName }}-reporting"
\ No newline at end of file
......@@ -13,4 +13,4 @@ pathLanguageSource: /configs/languages/labels.csv
cantonLabels: canton-labels-csv
pathCantonLabels: /configs/cantons/labels.csv
municipalityLabels: municipality-labels-csv
pathMunicipaltiyLabels: /configs/municipalities/labels.csv
\ No newline at end of file
pathMunicipalityLabels: /configs/municipalities/labels.csv
\ No newline at end of file
......@@ -44,9 +44,9 @@ import org.memobase.model.RichText
class RdfTransformer(properties: Properties) {
private val log = LogManager.getLogger(this::class.java)
private val cantons = Util.getCantons()
private val municipalities = Util.getMunicipalities()
private val languages = Util.loadLanguages(properties.getProperty(Util.languageSourceFilePathPropertyName))
private val cantons = Util.readLabelFile(properties.getProperty(Util.pathCantons))
private val municipalities = Util.readLabelFile(properties.getProperty(Util.pathMunicipalities))
private val languages = Util.readLabelFile(properties.getProperty(Util.pathLanguages))
fun createInstitution(input: Institution): Pair<String, Model> {
val model = ModelFactory.createDefaultModel()
......
......@@ -27,7 +27,9 @@ class Service(file: String = "app.yml") {
val settings = SettingsLoader(
listOf(
Util.languageSourceFilePathPropertyName
Util.pathMunicipalities,
Util.pathCantons,
Util.pathLanguages
),
file,
useStreamsConfig = true
......
......@@ -18,8 +18,8 @@
package org.memobase
import ch.memobase.rdf.NS
import com.github.doyaaaaaken.kotlincsv.dsl.csvReader
import java.io.File
import java.io.FileInputStream
import java.io.FileNotFoundException
import java.io.StringWriter
import java.time.LocalDateTime
......@@ -33,69 +33,38 @@ import org.apache.logging.log4j.LogManager
import org.memobase.model.IdLabels
object Util {
const val languageSourceFilePathPropertyName = "path.languages"
const val pathLanguages = "path.languages"
const val pathCantons = "path.cantons"
const val pathMunicipalities = "path.municipalities"
const val memoriavProject = "memoriavProject"
const val memoriavUri = NS.mbcb + "mrv"
val now: String = LocalDateTime.now().format(DateTimeFormatter.ISO_DATE_TIME)
private val log = LogManager.getLogger("DrupalSyncHelpers")
private val log = LogManager.getLogger(this::class.java)
fun getMunicipalities(): Map<String, IdLabels> {
val stream = ClassLoader.getSystemResourceAsStream("municipalities.tsv")
if (stream != null) {
return stream.bufferedReader().lineSequence().filterNot {
it.startsWith("code")
}.map {
val values = it.split("\t")
val codes = values[0].split(",").map { code -> code.trim() }
codes.flatMap { code ->
code.split("-").map { c -> c.trim() }
}.map { code ->
Pair(
code, IdLabels(
values[1].trim(),
values[2].trim(),
values[3].trim(),
values[4].trim()
)
)
private val csv = csvReader()
fun readLabelFile(path: String): Map<String, IdLabels> {
try {
val labelList = csv.readAll(File(path))
val labelsMap = mutableMapOf<String, IdLabels>()
for (row in labelList.listIterator(1)) {
val code = row[0].trim()
val id = row[1].trim()
if (code.contains(",")) {
code.split(",").map {
it.trim()
}.forEach {
labelsMap[it] = IdLabels(id, row[2].trim(), row[3].trim(), row[4].trim())
}
}.flatten().toMap()
} else {
log.error("Could not load municipalities.tsv from classpath!")
exitProcess(1)
labelsMap[code] = IdLabels(id, row[2].trim(), row[3].trim(), row[4].trim())
}
}
fun loadLanguages(path: String): Map<String, IdLabels> {
return try {
val stream = FileInputStream(File(path))
stream.bufferedReader().lineSequence().filterNot { it.startsWith("code") }
.map {
val values = it.split(",")
Pair(
values[0].trim(),
IdLabels(NS.wd + values[1].trim(), values[2].trim(), values[3].trim(), values[4].trim())
)
}.toMap()
return labelsMap
} catch (ex: FileNotFoundException) {
log.error("Could not find language labels in path $path.")
exitProcess(1)
}
}
fun getCantons(): Map<String, IdLabels> {
val stream = ClassLoader.getSystemResourceAsStream("cantons.csv")
if (stream != null) {
return stream.bufferedReader().lineSequence().filterNot {
it.startsWith("code")
}.map {
val values = it.split(",")
Pair(values[0].trim(), IdLabels(values[1].trim(), values[2].trim(), values[3].trim(), values[4].trim()))
}.toMap()
} else {
log.error("Could not load cantons.csv from classpath!")
log.error(ex.localizedMessage)
exitProcess(1)
}
}
......
app:
path:
languages: ${PATH_LANGUAGE_SOURCE:?system}
languages: ${PATH_LANGUAGE_LABELS:?system}
cantons: ${PATH_CANTON_LABELS:?system}
municipalities: ${PATH_MUNICIPALITY_LABELS:?system}
kafka:
streams:
bootstrap.servers: ${KAFKA_BOOTSTRAP_SERVERS:?system}
......
code,item,de,fr,it
VS,http://www.wikidata.org/entity/Q834,Wallis,Valais,Valais
GL,http://www.wikidata.org/entity/Q11922,Glarus,Glaris,Glaris
BE,http://www.wikidata.org/entity/Q11911,Bern,Berne,Berne
SO,http://www.wikidata.org/entity/Q11929,Solothurn,Soleure,Soleure
GE,http://www.wikidata.org/entity/Q11917,Genf,Genève,Genève
GR,http://www.wikidata.org/entity/Q11925,Graubünden,Grisons,Grisons
ZG,http://www.wikidata.org/entity/Q11933,Zug,Zoug,Zoug
ZH,http://www.wikidata.org/entity/Q11943,Zürich,Zurich,Zurich
AG,http://www.wikidata.org/entity/Q11972,Aargau,Argovie,Argovie
AR,http://www.wikidata.org/entity/Q12079,Appenzell Ausserrhoden,Appenzell Rhodes-Extérieures,Appenzell Rhodes-Extérieures
LU,http://www.wikidata.org/entity/Q12121,Luzern,Lucerne,Lucerne
BL,http://www.wikidata.org/entity/Q12146,Basel-Landschaft,Bâle-Campagne,Bâle-Campagne
AI,http://www.wikidata.org/entity/Q12094,Appenzell Innerrhoden,Appenzell Rhodes-Intérieures,Appenzell Rhodes-Intérieures
BS,http://www.wikidata.org/entity/Q12172,Basel-Stadt,Bâle-Ville,Bâle-Ville
UR,http://www.wikidata.org/entity/Q12404,Uri,Uri,Uri
NW,http://www.wikidata.org/entity/Q12592,Nidwalden,Nidwald,Nidwald
TG,http://www.wikidata.org/entity/Q12713,Thurgau,Thurgovie,Thurgovie
TI,http://www.wikidata.org/entity/Q12724,Tessin,Tessin,Tessin
FR,http://www.wikidata.org/entity/Q12640,Freiburg,Fribourg,Fribourg
OW,http://www.wikidata.org/entity/Q12573,Obwalden,Obwald,Obwald
SZ,http://www.wikidata.org/entity/Q12433,Schwyz,Schwytz,Schwytz
SG,http://www.wikidata.org/entity/Q12746,St. Gallen,Saint-Gall,Saint-Gall
SH,http://www.wikidata.org/entity/Q12697,Schaffhausen,Schaffhouse,Schaffhouse
NE,http://www.wikidata.org/entity/Q12738,Neuenburg,Neuchâtel,Neuchâtel
JU,http://www.wikidata.org/entity/Q12755,Jura,Jura,Jura
VD,http://www.wikidata.org/entity/Q12771,Waadt,Vaud,Vaud
This diff is collapsed.
package org.memobase
import org.assertj.core.api.Assertions.assertThat
import org.junit.jupiter.api.Test
import org.junit.jupiter.api.TestInstance
import org.junit.jupiter.api.assertAll
/**
 * Exercises [Util.readLabelFile] against the label CSV fixtures located in
 * `src/test/resources/labels`.
 */
@TestInstance(TestInstance.Lifecycle.PER_CLASS)
class TestFileReader {
    // Relative path of the directory that holds the label CSV fixtures.
    private val labelResourcePath = "src/test/resources/labels"

    /** The codes 8506 and 8535 are both expected to resolve to the same id. */
    @Test
    fun `test municipalities loader`() {
        val municipalities = Util.readLabelFile("$labelResourcePath/municipalities.csv")
        val expectedId = "http://www.wikidata.org/entity/Q65761"
        assertAll(
            "",
            { assertThat(municipalities).isNotEmpty },
            { assertThat(municipalities["8506"]?.id).isEqualTo(expectedId) },
            { assertThat(municipalities["8535"]?.id).isEqualTo(expectedId) }
        )
    }

    /** The cantons fixture is expected to yield exactly 26 entries. */
    @Test
    fun `test cantons loader`() {
        val cantons = Util.readLabelFile("$labelResourcePath/cantons.csv")
        assertAll(
            "",
            { assertThat(cantons).hasSize(26) }
        )
    }

    /** The metadata languages fixture is expected to yield exactly 3 entries. */
    @Test
    fun `test metadata languages loader`() {
        val languages = Util.readLabelFile("$labelResourcePath/metadata-languages.csv")
        assertAll(
            "",
            { assertThat(languages).hasSize(3) }
        )
    }
}
\ No newline at end of file
package org.memobase
import org.assertj.core.api.Assertions
import org.junit.jupiter.api.Test
/** Smoke test for the municipality labels returned by [Util.getMunicipalities]. */
class TestUnit {
    /** The loaded municipality map must contain at least one entry. */
    @Test
    fun `test municipalities loader`() {
        Assertions.assertThat(Util.getMunicipalities()).isNotEmpty
    }
}
\ No newline at end of file
......@@ -49,14 +49,14 @@ mbcb:completeExampleTest
wdt:P669 "Street" ;
wdt:P670 "Address"
] ;
rico:identifiedBy [ a rico:Identifier ;
rico:identifier "OLD_MEMOBASE_ID" ;
rico:type "oldMemobase"
] ;
rico:identifiedBy [ a rico:Identifier ;
rico:identifier "completeExampleTest" ;
rico:type "main"
] ;
rico:identifiedBy [ a rico:Identifier ;
rico:identifier "OLD_MEMOBASE_ID" ;
rico:type "oldMemobase"
] ;
rico:isHolderOf "https://memobase.ch/recordSet/testComplete" ;
rico:name "Complete Example"@de , "Complete Example"@fr , "Complete Example"@it ;
rico:type "memobaseInstitution" .
......@@ -36,7 +36,7 @@ mbrs:testComplete a rico:RecordSet ;
rico:title "Videobestand Cathy Sharp Dance Ensemble"@de ;
rico:type "memoriavProject"
] ;
rdau:P60496 "<p>Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p>"@fr , "<p>Zugang Memobase</p>\r\n\r\n<p>Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p>"@it , "<p>Zugang Memobase</p>\r\n\r\n<p>Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p>"@de ;
rdau:P60496 "<p>Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p>"@fr , "<p>Zugang Memobase</p>\r\n\r\n<p>Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p>"@de , "<p>Zugang Memobase</p>\r\n\r\n<p>Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p>"@it ;
wdt:P18 "https://mb-wf1.memobase.unibas.ch/sites/default/files/styles/teaser/public/2020-11/StadtArchivSchaffhausenGeb2.jpg?itok=2PsMvPqc" ;
internal:isPublished false ;
rico:conditionsOfAccess "<p>Zugang</p>\r\n\r\n<p>Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p>"@it , "<p>Zugang</p>\r\n\r\n<p>Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p>"@fr , "<p>Zugang</p>\r\n\r\n<p>Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p>"@de ;
......@@ -61,14 +61,14 @@ mbrs:testComplete a rico:RecordSet ;
] ;
rico:heldBy "https://memobase.ch/institution/clg" ;
rico:history "<p>Kontext</p>\r\n\r\n<p>Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p>"@de , "<p>Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p>"@it , "<p>Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p>"@fr ;
rico:identifiedBy [ a rico:Identifier ;
rico:identifier "OLD_MEMOBASE_ID" ;
rico:type "oldMemobase"
] ;
rico:identifiedBy [ a rico:Identifier ;
rico:identifier "ORIGINAL_SIGNATUR" ;
rico:type "callNumber"
] ;
rico:identifiedBy [ a rico:Identifier ;
rico:identifier "OLD_MEMOBASE_ID" ;
rico:type "oldMemobase"
] ;
rico:identifiedBy [ a rico:Identifier ;
rico:identifier "ORIGINAL_ID" ;
rico:type "original"
......@@ -85,35 +85,32 @@ mbrs:testComplete a rico:RecordSet ;
[ a rico:Record ;
schema:sameAs "https://example.org/"
] ;
rico:isRecordResourceAssociatedWithRecordResource
[ a rico:RecordSet ;
schema:sameAs "https://example.org"
] ;
rico:isRecordResourceAssociatedWithRecordResource
[ a rico:RecordSet ;
schema:sameAs "http://example.com" ;
rico:title "Verwandte Bestände"@it
] ;
rico:isRecordResourceAssociatedWithRecordResource
[ a rico:RecordSet ;
schema:sameAs "entity:node/120" ;
rico:title "Fernsehbestand Bericht vor Acht / Blickpunkt (Memobase Vorgängerversion)"@it
] ;
rico:isRecordResourceAssociatedWithRecordResource
[ a rico:Record ;
schema:sameAs "https://example.org/"
] ;
rico:isRecordResourceAssociatedWithRecordResource
[ a rico:Record ;
schema:sameAs "https://example.org/" ;
rico:title "Publikation"@it
] ;
rico:isRecordResourceAssociatedWithRecordResource
[ a rico:Record ;
schema:sameAs "https://example.org/"
[ a rico:RecordSet ;
schema:sameAs "http://example.com" ;
rico:title "Verwandte Bestände"@it
] ;
rico:isRecordResourceAssociatedWithRecordResource
[ a rico:RecordSet ;
schema:sameAs "entity:node/43"
schema:sameAs "https://example.org"
] ;
rico:isSubjectOf [ a rico:Record ;
schema:sameAs "https://example.org/"
rico:isRecordResourceAssociatedWithRecordResource
[ a rico:RecordSet ;
schema:sameAs "entity:node/43"
] ;
rico:isSubjectOf [ a rico:Record ;
schema:sameAs "https://example.org/"
......@@ -122,7 +119,10 @@ mbrs:testComplete a rico:RecordSet ;
schema:sameAs "https://example.org/" ;
rico:title "Publikation"@it
] ;
rico:modificationDate "2021-02-10T11:53:00.39"^^xsd:dateTime ;
rico:isSubjectOf [ a rico:Record ;
schema:sameAs "https://example.org/"
] ;
rico:modificationDate "2021-02-11T09:16:13.973"^^xsd:dateTime ;
rico:publicationDate "2020-09-04"^^xsd:date ;
rico:recordResourceExtent "<p>Rechte</p>\r\n\r\n<p>Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p>"@it , "<p>Rechte</p>\r\n\r\n<p>Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p>"@fr , "<p>Rechte</p>\r\n\r\n<p>Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p>"@de ;
rico:recordResourceOrInstantiationIsTargetOfRecordResourceHoldingRelation
......
code,item,de,fr,it
code,id,de,fr,it
VS,http://www.wikidata.org/entity/Q834,Wallis,Valais,Vallese
GL,http://www.wikidata.org/entity/Q11922,Glarus,Glaris,Glarona
BE,http://www.wikidata.org/entity/Q11911,Bern,Berne,Berna
......
code,id,de,fr,it
de,http://www.wikidata.org/entity/Q188,Deutsch,Allemand,Tedesco
fr,http://www.wikidata.org/entity/Q150,Französisch,Français,Francese
it,http://www.wikidata.org/entity/Q652,Italienisch,Italien,Italiano
\ No newline at end of file
This diff is collapsed.
code,wikidata,de,fr,it
de,Q188,Deutsch,Allemand,Tedesco
fr,Q150,Französisch,Français,Francese
it,Q150,Italienisch,Italien,Italiano
\ No newline at end of file
app:
path:
languages: "src/test/resources/meta_data_language_labels.csv"
languages: "src/test/resources/labels/metadata-languages.csv"
cantons: "src/test/resources/labels/cantons.csv"
municipalities: "src/test/resources/labels/municipalities.csv"
kafka:
streams:
bootstrap.servers: localhost:12345
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment