Due to a scheduled upgrade to version 14.10, GitLab will be unavailable on Monday 30.05., from 19:00 until 20:00.

Commit 856d80a4 authored by Jonas Waeber's avatar Jonas Waeber
Browse files

Fix bug in xlsx transformation

parent 102ed30d
Pipeline #9929 passed with stages
in 6 minutes and 11 seconds
...@@ -30,6 +30,7 @@ import org.apache.kafka.streams.Topology ...@@ -30,6 +30,7 @@ import org.apache.kafka.streams.Topology
import org.apache.kafka.streams.kstream.KStream import org.apache.kafka.streams.kstream.KStream
import org.apache.kafka.streams.kstream.Predicate import org.apache.kafka.streams.kstream.Predicate
import org.apache.poi.ss.usermodel.CellType import org.apache.poi.ss.usermodel.CellType
import org.apache.poi.ss.usermodel.Row
import org.apache.poi.ss.usermodel.WorkbookFactory import org.apache.poi.ss.usermodel.WorkbookFactory
import org.memobase.settings.SettingsLoader import org.memobase.settings.SettingsLoader
import org.memobase.sftp.SftpClient import org.memobase.sftp.SftpClient
...@@ -79,7 +80,7 @@ class KafkaTopology(private val settings: SettingsLoader) { ...@@ -79,7 +80,7 @@ class KafkaTopology(private val settings: SettingsLoader) {
.mapValues { key, _ -> .mapValues { key, _ ->
Klaxon().toJsonString( Klaxon().toJsonString(
Report( Report(
key, ReportStatus.failure, "table-data-transform", ReportStatus.failure,
ReportMessages.processFailure(key, "the input file is invalid.") ReportMessages.processFailure(key, "the input file is invalid.")
) )
) )
...@@ -139,7 +140,7 @@ class KafkaTopology(private val settings: SettingsLoader) { ...@@ -139,7 +140,7 @@ class KafkaTopology(private val settings: SettingsLoader) {
): Pair<List<Pair<Pair<String, JsonObject>, Report>>, Report> { ): Pair<List<Pair<Pair<String, JsonObject>, Report>>, Report> {
return try { return try {
val result = csvMapper(key, value) val result = csvMapper(key, value)
Pair(result, Report(key, ReportStatus.success, ReportMessages.processSuccess(result.size))) Pair(result, Report("table-data-transform", ReportStatus.success, ReportMessages.processSuccess(result.size)))
} catch (ex: InvalidInputException) { } catch (ex: InvalidInputException) {
Pair( Pair(
listOf( listOf(
...@@ -147,7 +148,7 @@ class KafkaTopology(private val settings: SettingsLoader) { ...@@ -147,7 +148,7 @@ class KafkaTopology(private val settings: SettingsLoader) {
Pair(key, ErrorResult.result), Pair(key, ErrorResult.result),
Report(key, ReportStatus.failure, ReportMessages.invalidFile(key, ex.localizedMessage)) Report(key, ReportStatus.failure, ReportMessages.invalidFile(key, ex.localizedMessage))
) )
), Report(key, ReportStatus.failure, ReportMessages.processFailure(key, ex.localizedMessage)) ), Report("table-data-transform", ReportStatus.failure, ReportMessages.processFailure(key, ex.localizedMessage))
) )
} }
} }
...@@ -251,7 +252,7 @@ class KafkaTopology(private val settings: SettingsLoader) { ...@@ -251,7 +252,7 @@ class KafkaTopology(private val settings: SettingsLoader) {
): Pair<List<Pair<Pair<String, JsonObject>, Report>>, Report> { ): Pair<List<Pair<Pair<String, JsonObject>, Report>>, Report> {
return try { return try {
val result = excelMapper(key, value) val result = excelMapper(key, value)
Pair(result, Report(key, ReportStatus.success, ReportMessages.processSuccess(result.size))) Pair(result, Report("table-data-transform", ReportStatus.success, ReportMessages.processSuccess(result.size)))
} catch (ex: InvalidInputException) { } catch (ex: InvalidInputException) {
Pair( Pair(
listOf( listOf(
...@@ -259,7 +260,7 @@ class KafkaTopology(private val settings: SettingsLoader) { ...@@ -259,7 +260,7 @@ class KafkaTopology(private val settings: SettingsLoader) {
Pair(key, ErrorResult.result), Pair(key, ErrorResult.result),
Report(key, ReportStatus.failure, ReportMessages.invalidFile(key, ex.localizedMessage)) Report(key, ReportStatus.failure, ReportMessages.invalidFile(key, ex.localizedMessage))
) )
), Report(key, ReportStatus.failure, ReportMessages.processFailure(key, ex.localizedMessage)) ), Report("table-data-transform", ReportStatus.failure, ReportMessages.processFailure(key, ex.localizedMessage))
) )
} catch (ex: IllegalArgumentException) { // Sheet index does not exist } catch (ex: IllegalArgumentException) { // Sheet index does not exist
Pair( Pair(
...@@ -268,7 +269,7 @@ class KafkaTopology(private val settings: SettingsLoader) { ...@@ -268,7 +269,7 @@ class KafkaTopology(private val settings: SettingsLoader) {
Pair(key, ErrorResult.result), Pair(key, ErrorResult.result),
Report(key, ReportStatus.failure, ReportMessages.invalidFile(key, ex.localizedMessage)) Report(key, ReportStatus.failure, ReportMessages.invalidFile(key, ex.localizedMessage))
) )
), Report(key, ReportStatus.failure, ReportMessages.processFailure(key, ex.localizedMessage)) ), Report("table-data-transform", ReportStatus.failure, ReportMessages.processFailure(key, ex.localizedMessage))
) )
} }
} }
...@@ -357,25 +358,13 @@ class KafkaTopology(private val settings: SettingsLoader) { ...@@ -357,25 +358,13 @@ class KafkaTopology(private val settings: SettingsLoader) {
Report(ex.key, ReportStatus.failure, ReportMessages.reportFailure(ex.localizedMessage)) Report(ex.key, ReportStatus.failure, ReportMessages.reportFailure(ex.localizedMessage))
) )
} }
val jsonObject = json { val jsonObject = json {
obj( obj(
zip( zip(
propertiesList, propertiesList,
row.map { cell -> retrieveCells(row, propertiesList.size - 1)
if (cell != null) { ))
when (cell.cellType) {
CellType.BLANK -> ""
CellType.BOOLEAN -> cell.booleanCellValue.toString()
CellType._NONE -> ""
CellType.NUMERIC -> cell.numericCellValue.toString()
CellType.STRING -> cell.stringCellValue
CellType.FORMULA -> ""
CellType.ERROR -> ""
else -> ""
}
} else ""
})
)
} }
Pair( Pair(
Pair(rowIdentifier, jsonObject), Pair(rowIdentifier, jsonObject),
...@@ -398,4 +387,22 @@ class KafkaTopology(private val settings: SettingsLoader) { ...@@ -398,4 +387,22 @@ class KafkaTopology(private val settings: SettingsLoader) {
private fun reportToJson(value: Report): KeyValue<String, String> { private fun reportToJson(value: Report): KeyValue<String, String> {
return KeyValue(value.id, Klaxon().toJsonString(value)) return KeyValue(value.id, Klaxon().toJsonString(value))
} }
/**
 * Collects the textual content of the cells of [row] at column indices
 * `0` through [size] (inclusive), so the result holds `size + 1` entries
 * whenever [size] is non-negative.
 *
 * Boolean and numeric cells are rendered through `toString()`, string cells
 * are returned as-is, and every other case (a null cell, blank, formula,
 * error or unknown cell type) is represented by an empty string.
 *
 * @param row the spreadsheet row to read from.
 * @param size the last column index to read (inclusive).
 * @return one string per column index, in ascending column order.
 */
private fun retrieveCells(row: Row, size: Int): List<String> {
    val values = ArrayList<String>()
    for (columnIndex in 0..size) {
        val current = row.getCell(columnIndex)
        if (current == null) {
            // No cell object at this index: keep the position with an empty value.
            values.add("")
            continue
        }
        val text = when (current.cellType) {
            CellType.BOOLEAN -> current.booleanCellValue.toString()
            CellType.NUMERIC -> current.numericCellValue.toString()
            CellType.STRING -> current.stringCellValue
            // BLANK, _NONE, FORMULA, ERROR and anything else carry no usable text here.
            else -> ""
        }
        values.add(text)
    }
    return values
}
} }
...@@ -21,6 +21,7 @@ import com.beust.klaxon.Klaxon ...@@ -21,6 +21,7 @@ import com.beust.klaxon.Klaxon
import java.io.File import java.io.File
import java.io.FileInputStream import java.io.FileInputStream
import java.nio.charset.Charset import java.nio.charset.Charset
import java.nio.file.Paths
import java.util.stream.Stream import java.util.stream.Stream
import org.apache.kafka.common.serialization.StringDeserializer import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.kafka.common.serialization.StringSerializer import org.apache.kafka.common.serialization.StringSerializer
...@@ -45,28 +46,17 @@ class Tests { ...@@ -45,28 +46,17 @@ class Tests {
private val sftpServer = EmbeddedSftpServer(22000, "user", "password") private val sftpServer = EmbeddedSftpServer(22000, "user", "password")
init { init {
sftpServer.createDirectories( val files = listOf(
"/memobase/test_institution_1/test_record_set_1", Pair("/memobase/test_record_set_1", "brandt_metadaten.csv"),
"/memobase/test_institution_2/test_record_set_2", Pair("/memobase/test_record_set_2", "mapping_baugazh.csv"),
"/memobase/test_institution_2/test_record_set_3", Pair("/memobase/test_record_set_3", "excel_test_file.xlsx"),
"/memobase/test_institution_3/test_record_set_4" Pair("/memobase/test_record_set_4", "excel_test_file2.xlsx"),
) Pair("/memobase/test_record_set_5", "B_MEI_ErfassungMemobase_20170626.xlsx")
sftpServer.putFile(
"/memobase/test_institution_1/test_record_set_1/brandt_metadaten.csv",
FileInputStream("src/test/resources/sftp/brandt_metadaten.csv")
)
sftpServer.putFile(
"/memobase/test_institution_2/test_record_set_2/bauGAZH_metadaten.csv",
FileInputStream("src/test/resources/sftp/mapping_baugazh.csv")
)
sftpServer.putFile(
"/memobase/test_institution_3/test_record_set_3/excel_test_file.xlsx",
FileInputStream("src/test/resources/sftp/excel_test_file.xlsx")
)
sftpServer.putFile(
"/memobase/test_institution_4/test_record_set_4/excel_test_file2.xlsx",
FileInputStream("src/test/resources/sftp/excel_test_file2.xlsx")
) )
for (pair in files) {
sftpServer.putFile(Paths.get(pair.first, pair.second).toString(), FileInputStream(Paths.get("src/test/resources/sftp", pair.second).toFile()))
}
} }
@ParameterizedTest @ParameterizedTest
...@@ -139,7 +129,13 @@ class Tests { ...@@ -139,7 +129,13 @@ class Tests {
"brandt_metadaten.csv", "brandt_metadaten.csv",
listOf("brandt_metadaten.csv"), listOf("brandt_metadaten.csv"),
"error_filter_output", "error_filter_output",
Klaxon().toJsonString(Report("brandt_metadaten.csv", "FAILURE", "Could not process file brandt_metadaten.csv, because the input file is invalid.")) Klaxon().toJsonString(
Report(
"table-data-transform",
"FAILURE",
"Could not process file brandt_metadaten.csv, because the input file is invalid."
)
)
), ),
TestParams( TestParams(
"valid csv input", "valid csv input",
...@@ -148,7 +144,7 @@ class Tests { ...@@ -148,7 +144,7 @@ class Tests {
"brandt_metadaten.csv", "brandt_metadaten.csv",
listOf("AVGR13716"), listOf("AVGR13716"),
"brandt_output", "brandt_output",
Klaxon().toJsonString(Report("brandt_metadaten.csv", "SUCCESS", "Transformed table data into 1 records.")) Klaxon().toJsonString(Report("table-data-transform", "SUCCESS", "Transformed table data into 1 records."))
), ),
TestParams( TestParams(
"invalid xlsx input", "invalid xlsx input",
...@@ -157,7 +153,13 @@ class Tests { ...@@ -157,7 +153,13 @@ class Tests {
"excel_test_file.xlsx", "excel_test_file.xlsx",
listOf("excel_test_file.xlsx"), listOf("excel_test_file.xlsx"),
"excel_output", "excel_output",
Klaxon().toJsonString(Report("excel_test_file.xlsx", "FAILURE", "Could not process file excel_test_file.xlsx, because The property in cell J3 contains one or more invalid characters: [., :, /, +].")) Klaxon().toJsonString(
Report(
"table-data-transform",
"FAILURE",
"Could not process file excel_test_file.xlsx, because The property in cell J3 contains one or more invalid characters: [., :, /, +]."
)
)
), ),
TestParams( TestParams(
"valid xlsx input", "valid xlsx input",
...@@ -166,9 +168,17 @@ class Tests { ...@@ -166,9 +168,17 @@ class Tests {
"excel_test_file2.xlsx", "excel_test_file2.xlsx",
listOf("AVGR13716", "AVGR13717", ""), listOf("AVGR13716", "AVGR13717", ""),
"excel_output_valid", "excel_output_valid",
Klaxon().toJsonString(Report("excel_test_file2.xlsx", "SUCCESS", "Transformed table data into 2 records.")) Klaxon().toJsonString(Report("table-data-transform", "SUCCESS", "Transformed table data into 2 records."))
) )/*,
/*, TestParams(
"example xlsx input",
"test5.yml",
"mei_test.json",
"B_MEI_ErfassungMemobase_20170626.xlsx",
listOf("MEI_49884", "AVGR13717", ""),
"mei_output",
Klaxon().toJsonString(Report("table-data-transform", "SUCCESS", "Transformed table data into 2 records."))
),
TestParams( TestParams(
"valid csv input", "valid csv input",
"test2.yml", "test2.yml",
......
{ {
"path": "/memobase/test_institution_2/test_record_set_2/bauGAZH_metadaten.csv", "path": "/memobase/test_record_set_2/bauGAZH_metadaten.csv",
"format": "CSV" "format": "CSV"
} }
\ No newline at end of file
{ {
"path": "/memobase/test_institution_1/test_record_set_1/brandt_metadaten.csv", "path": "/memobase/test_record_set_1/brandt_metadaten.csv",
"format": "CSV" "format": "CSV"
} }
\ No newline at end of file
{ {
"path": "/memobase/test_institution_3/test_record_set_3/excel_test_file.xlsx", "path": "/memobase/test_record_set_3/excel_test_file.xlsx",
"format": "XLSX" "format": "XLSX"
} }
\ No newline at end of file
{ {
"path": "/memobase/test_institution_4/test_record_set_4/excel_test_file2.xlsx", "path": "/memobase/test_record_set_4/excel_test_file2.xlsx",
"format": "XLSX" "format": "XLSX"
} }
\ No newline at end of file
{"Titel":"«Villa Siegel», Zürich","Beschreibung":"Villa mit Garten und Brunnen im Vordergrund. Vermutlich von Architekt Walz","Genre":"Bauwerk","Aufnahmeort":"Zürich","Autorin":"Atelier Meiner","Auftraggeber":"Walz","Verwandte Dokumente":"Auftragsregister Bd. 6; Bildverzeichnis Bd. 7","Erstellung":"19210914","Rechtinhaber":"BAZ ","Nutzungsrecht":"nach Absprache","Medium":"Negativ Nitrat (NN)","Format":"18x24","Farbe":"sw","Orientation":"Querformat","Zustand":"Nitratschaden","Bewertung":"B","OriginalID":"MEI_49884","AlteSignatur":"49884","Digitalisat":"MEI_49884","Bearbeitungskontext":"Pilot Meiner"}
\ No newline at end of file
{"Exemplar-AVGRNr":"AVGR13717","Permalink":"https://www.gr.ch/Exemplare/13717","Titel-Title":"Schwimmfest am Untersee","Titel-Beschreibung":"Viel Publikum in alter Badi am Untersee \u2013 Wettkämpfe in Wasserball, Schwimmen, Staffel und einzeln - Turmspringen","Titel-ProduktionsjahrdesOriginals":"1920, 1920-1929, genaues Datum nicht eruierbar","Titel-FilmPersonen":"Brandt, Carl","Titel-Funktionen":"Autor/in","Titel-Genre":"Dokumentarfilm; Amateurfilm","Titel-Genres":"Wassersport","Titel-Drehort":"Arosa","Titel-Stream-Url":"https://s3-eu-west-1.amazonaws.com/streaming.av-portal.gr.ch/13717/AVGR13717.mov","Titel-Benutzerzugang":"Intranet","Medium-Materialbezeichnung":"Film","Medium-MedienFormat":"35-mm-Film, Negativ und Positiv, Nitrat","Medium-Ton":"stumm","Medium-Farbe":"s/w getönt","Medium-Dauer":"0.00502314814814815","Medium-Bandlaenge":"Vorhandene Elemente: AVGR9973: Negativ, Nitrat (CS, Z 986-123.5); AVGR9974: Positiv Nitrat (CS, Z 986-122.3); AVGR12099: Internegativ 2234, Kopie 2016 (KBG); AVGR13717: Positivkopie Farbe 2383, Kopie 2016, eingefärbte Sequenzen (KBG)"}
\ No newline at end of file
{"id" : "MEI_49884", "message" : "Successfully transformed row 5 into key-value map with identifier MEI_49884.", "status" : "SUCCESS"}
\ No newline at end of file
{"id" : "AVGR13717", "message" : "Successfully transformed row 5 into key-value map with identifier AVGR13717.", "status" : "SUCCESS"}
\ No newline at end of file
{
"path": "/memobase/test_record_set_5/B_MEI_ErfassungMemobase_20170626.xlsx",
"format": "XLSX"
}
\ No newline at end of file
sftp:
host: localhost
port: 22000
user: user
password: password
app:
sheet: 1
header:
count: 4
line: 3
identifier: 31
kafka:
streams:
bootstrap.servers: localhost:12345
application.id: test-clinet-1234
topic:
in: test-topic-in
out: test-topic-out
process: process-topic-id-reporting
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment