Commit 856d80a4 authored by Jonas Waeber

Fix bug in xlsx transformation

parent 102ed30d
Pipeline #9929 passed with stages
in 6 minutes and 11 seconds
......@@ -30,6 +30,7 @@ import org.apache.kafka.streams.Topology
import org.apache.kafka.streams.kstream.KStream
import org.apache.kafka.streams.kstream.Predicate
import org.apache.poi.ss.usermodel.CellType
import org.apache.poi.ss.usermodel.Row
import org.apache.poi.ss.usermodel.WorkbookFactory
import org.memobase.settings.SettingsLoader
import org.memobase.sftp.SftpClient
......@@ -79,7 +80,7 @@ class KafkaTopology(private val settings: SettingsLoader) {
.mapValues { key, _ ->
Klaxon().toJsonString(
Report(
key, ReportStatus.failure,
"table-data-transform", ReportStatus.failure,
ReportMessages.processFailure(key, "the input file is invalid.")
)
)
......@@ -139,7 +140,7 @@ class KafkaTopology(private val settings: SettingsLoader) {
): Pair<List<Pair<Pair<String, JsonObject>, Report>>, Report> {
return try {
val result = csvMapper(key, value)
Pair(result, Report(key, ReportStatus.success, ReportMessages.processSuccess(result.size)))
Pair(result, Report("table-data-transform", ReportStatus.success, ReportMessages.processSuccess(result.size)))
} catch (ex: InvalidInputException) {
Pair(
listOf(
......@@ -147,7 +148,7 @@ class KafkaTopology(private val settings: SettingsLoader) {
Pair(key, ErrorResult.result),
Report(key, ReportStatus.failure, ReportMessages.invalidFile(key, ex.localizedMessage))
)
), Report(key, ReportStatus.failure, ReportMessages.processFailure(key, ex.localizedMessage))
), Report("table-data-transform", ReportStatus.failure, ReportMessages.processFailure(key, ex.localizedMessage))
)
}
}
......@@ -251,7 +252,7 @@ class KafkaTopology(private val settings: SettingsLoader) {
): Pair<List<Pair<Pair<String, JsonObject>, Report>>, Report> {
return try {
val result = excelMapper(key, value)
Pair(result, Report(key, ReportStatus.success, ReportMessages.processSuccess(result.size)))
Pair(result, Report("table-data-transform", ReportStatus.success, ReportMessages.processSuccess(result.size)))
} catch (ex: InvalidInputException) {
Pair(
listOf(
......@@ -259,7 +260,7 @@ class KafkaTopology(private val settings: SettingsLoader) {
Pair(key, ErrorResult.result),
Report(key, ReportStatus.failure, ReportMessages.invalidFile(key, ex.localizedMessage))
)
), Report(key, ReportStatus.failure, ReportMessages.processFailure(key, ex.localizedMessage))
), Report("table-data-transform", ReportStatus.failure, ReportMessages.processFailure(key, ex.localizedMessage))
)
} catch (ex: IllegalArgumentException) { // Sheet index does not exist
Pair(
......@@ -268,7 +269,7 @@ class KafkaTopology(private val settings: SettingsLoader) {
Pair(key, ErrorResult.result),
Report(key, ReportStatus.failure, ReportMessages.invalidFile(key, ex.localizedMessage))
)
), Report(key, ReportStatus.failure, ReportMessages.processFailure(key, ex.localizedMessage))
), Report("table-data-transform", ReportStatus.failure, ReportMessages.processFailure(key, ex.localizedMessage))
)
}
}
......@@ -357,25 +358,13 @@ class KafkaTopology(private val settings: SettingsLoader) {
Report(ex.key, ReportStatus.failure, ReportMessages.reportFailure(ex.localizedMessage))
)
}
val jsonObject = json {
obj(
zip(
propertiesList,
row.map { cell ->
if (cell != null) {
when (cell.cellType) {
CellType.BLANK -> ""
CellType.BOOLEAN -> cell.booleanCellValue.toString()
CellType._NONE -> ""
CellType.NUMERIC -> cell.numericCellValue.toString()
CellType.STRING -> cell.stringCellValue
CellType.FORMULA -> ""
CellType.ERROR -> ""
else -> ""
}
} else ""
})
)
retrieveCells(row, propertiesList.size - 1)
))
}
Pair(
Pair(rowIdentifier, jsonObject),
......@@ -398,4 +387,22 @@ class KafkaTopology(private val settings: SettingsLoader) {
private fun reportToJson(value: Report): KeyValue<String, String> {
return KeyValue(value.id, Klaxon().toJsonString(value))
}
private fun retrieveCells(row: Row, size: Int): List<String> {
return (0..size).map { i ->
val cell = row.getCell(i)
if (cell != null) {
when (cell.cellType) {
CellType.BLANK -> ""
CellType.BOOLEAN -> cell.booleanCellValue.toString()
CellType._NONE -> ""
CellType.NUMERIC -> cell.numericCellValue.toString()
CellType.STRING -> cell.stringCellValue
CellType.FORMULA -> ""
CellType.ERROR -> ""
else -> ""
}
} else ""
}
}
}
......@@ -21,6 +21,7 @@ import com.beust.klaxon.Klaxon
import java.io.File
import java.io.FileInputStream
import java.nio.charset.Charset
import java.nio.file.Paths
import java.util.stream.Stream
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.kafka.common.serialization.StringSerializer
......@@ -45,28 +46,17 @@ class Tests {
private val sftpServer = EmbeddedSftpServer(22000, "user", "password")
init {
sftpServer.createDirectories(
"/memobase/test_institution_1/test_record_set_1",
"/memobase/test_institution_2/test_record_set_2",
"/memobase/test_institution_2/test_record_set_3",
"/memobase/test_institution_3/test_record_set_4"
)
sftpServer.putFile(
"/memobase/test_institution_1/test_record_set_1/brandt_metadaten.csv",
FileInputStream("src/test/resources/sftp/brandt_metadaten.csv")
)
sftpServer.putFile(
"/memobase/test_institution_2/test_record_set_2/bauGAZH_metadaten.csv",
FileInputStream("src/test/resources/sftp/mapping_baugazh.csv")
)
sftpServer.putFile(
"/memobase/test_institution_3/test_record_set_3/excel_test_file.xlsx",
FileInputStream("src/test/resources/sftp/excel_test_file.xlsx")
)
sftpServer.putFile(
"/memobase/test_institution_4/test_record_set_4/excel_test_file2.xlsx",
FileInputStream("src/test/resources/sftp/excel_test_file2.xlsx")
val files = listOf(
Pair("/memobase/test_record_set_1", "brandt_metadaten.csv"),
Pair("/memobase/test_record_set_2", "mapping_baugazh.csv"),
Pair("/memobase/test_record_set_3", "excel_test_file.xlsx"),
Pair("/memobase/test_record_set_4", "excel_test_file2.xlsx"),
Pair("/memobase/test_record_set_5", "B_MEI_ErfassungMemobase_20170626.xlsx")
)
for (pair in files) {
sftpServer.putFile(Paths.get(pair.first, pair.second).toString(), FileInputStream(Paths.get("src/test/resources/sftp", pair.second).toFile()))
}
}
@ParameterizedTest
......@@ -139,7 +129,13 @@ class Tests {
"brandt_metadaten.csv",
listOf("brandt_metadaten.csv"),
"error_filter_output",
Klaxon().toJsonString(Report("brandt_metadaten.csv", "FAILURE", "Could not process file brandt_metadaten.csv, because the input file is invalid."))
Klaxon().toJsonString(
Report(
"table-data-transform",
"FAILURE",
"Could not process file brandt_metadaten.csv, because the input file is invalid."
)
)
),
TestParams(
"valid csv input",
......@@ -148,7 +144,7 @@ class Tests {
"brandt_metadaten.csv",
listOf("AVGR13716"),
"brandt_output",
Klaxon().toJsonString(Report("brandt_metadaten.csv", "SUCCESS", "Transformed table data into 1 records."))
Klaxon().toJsonString(Report("table-data-transform", "SUCCESS", "Transformed table data into 1 records."))
),
TestParams(
"invalid xlsx input",
......@@ -157,7 +153,13 @@ class Tests {
"excel_test_file.xlsx",
listOf("excel_test_file.xlsx"),
"excel_output",
Klaxon().toJsonString(Report("excel_test_file.xlsx", "FAILURE", "Could not process file excel_test_file.xlsx, because The property in cell J3 contains one or more invalid characters: [., :, /, +]."))
Klaxon().toJsonString(
Report(
"table-data-transform",
"FAILURE",
"Could not process file excel_test_file.xlsx, because The property in cell J3 contains one or more invalid characters: [., :, /, +]."
)
)
),
TestParams(
"valid xlsx input",
......@@ -166,9 +168,17 @@ class Tests {
"excel_test_file2.xlsx",
listOf("AVGR13716", "AVGR13717", ""),
"excel_output_valid",
Klaxon().toJsonString(Report("excel_test_file2.xlsx", "SUCCESS", "Transformed table data into 2 records."))
)
/*,
Klaxon().toJsonString(Report("table-data-transform", "SUCCESS", "Transformed table data into 2 records."))
)/*,
TestParams(
"example xlsx input",
"test5.yml",
"mei_test.json",
"B_MEI_ErfassungMemobase_20170626.xlsx",
listOf("MEI_49884", "AVGR13717", ""),
"mei_output",
Klaxon().toJsonString(Report("table-data-transform", "SUCCESS", "Transformed table data into 2 records."))
),
TestParams(
"valid csv input",
"test2.yml",
......
{
"path": "/memobase/test_institution_2/test_record_set_2/bauGAZH_metadaten.csv",
"path": "/memobase/test_record_set_2/bauGAZH_metadaten.csv",
"format": "CSV"
}
\ No newline at end of file
{
"path": "/memobase/test_institution_1/test_record_set_1/brandt_metadaten.csv",
"path": "/memobase/test_record_set_1/brandt_metadaten.csv",
"format": "CSV"
}
\ No newline at end of file
{
"path": "/memobase/test_institution_3/test_record_set_3/excel_test_file.xlsx",
"path": "/memobase/test_record_set_3/excel_test_file.xlsx",
"format": "XLSX"
}
\ No newline at end of file
{
"path": "/memobase/test_institution_4/test_record_set_4/excel_test_file2.xlsx",
"path": "/memobase/test_record_set_4/excel_test_file2.xlsx",
"format": "XLSX"
}
\ No newline at end of file
{"Titel":"«Villa Siegel», Zürich","Beschreibung":"Villa mit Garten und Brunnen im Vordergrund. Vermutlich von Architekt Walz","Genre":"Bauwerk","Aufnahmeort":"Zürich","Autorin":"Atelier Meiner","Auftraggeber":"Walz","Verwandte Dokumente":"Auftragsregister Bd. 6; Bildverzeichnis Bd. 7","Erstellung":"19210914","Rechtinhaber":"BAZ ","Nutzungsrecht":"nach Absprache","Medium":"Negativ Nitrat (NN)","Format":"18x24","Farbe":"sw","Orientation":"Querformat","Zustand":"Nitratschaden","Bewertung":"B","OriginalID":"MEI_49884","AlteSignatur":"49884","Digitalisat":"MEI_49884","Bearbeitungskontext":"Pilot Meiner"}
\ No newline at end of file
{"Exemplar-AVGRNr":"AVGR13717","Permalink":"https://www.gr.ch/Exemplare/13717","Titel-Title":"Schwimmfest am Untersee","Titel-Beschreibung":"Viel Publikum in alter Badi am Untersee \u2013 Wettkämpfe in Wasserball, Schwimmen, Staffel und einzeln - Turmspringen","Titel-ProduktionsjahrdesOriginals":"1920, 1920-1929, genaues Datum nicht eruierbar","Titel-FilmPersonen":"Brandt, Carl","Titel-Funktionen":"Autor/in","Titel-Genre":"Dokumentarfilm; Amateurfilm","Titel-Genres":"Wassersport","Titel-Drehort":"Arosa","Titel-Stream-Url":"https://s3-eu-west-1.amazonaws.com/streaming.av-portal.gr.ch/13717/AVGR13717.mov","Titel-Benutzerzugang":"Intranet","Medium-Materialbezeichnung":"Film","Medium-MedienFormat":"35-mm-Film, Negativ und Positiv, Nitrat","Medium-Ton":"stumm","Medium-Farbe":"s/w getönt","Medium-Dauer":"0.00502314814814815","Medium-Bandlaenge":"Vorhandene Elemente: AVGR9973: Negativ, Nitrat (CS, Z 986-123.5); AVGR9974: Positiv Nitrat (CS, Z 986-122.3); AVGR12099: Internegativ 2234, Kopie 2016 (KBG); AVGR13717: Positivkopie Farbe 2383, Kopie 2016, eingefärbte Sequenzen (KBG)"}
\ No newline at end of file
{"id" : "MEI_49884", "message" : "Successfully transformed row 5 into key-value map with identifier MEI_49884.", "status" : "SUCCESS"}
\ No newline at end of file
{"id" : "AVGR13717", "message" : "Successfully transformed row 5 into key-value map with identifier AVGR13717.", "status" : "SUCCESS"}
\ No newline at end of file
{
"path": "/memobase/test_record_set_5/B_MEI_ErfassungMemobase_20170626.xlsx",
"format": "XLSX"
}
\ No newline at end of file
sftp:
host: localhost
port: 22000
user: user
password: password
app:
sheet: 1
header:
count: 4
line: 3
identifier: 31
kafka:
streams:
bootstrap.servers: localhost:12345
application.id: test-clinet-1234
topic:
in: test-topic-in
out: test-topic-out
process: process-topic-id-reporting
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment