Commit a9eae909 authored by Jonas Waeber's avatar Jonas Waeber
Browse files

Add test for excel reader.

Refactor SFTP Reader to be its own module
Ensure input stream and workbook are closed.
parent ffba1ade
......@@ -33,7 +33,8 @@ import org.memobase.models.ReportMessages
class KafkaTopology(private val settings: SettingsLoader) {
private val step = settings.appSettings.getProperty(Service.reportingStepNamePropName)
private val parser = TableParser(step, settings)
private val reader = SftpReader(settings.sftpSettings)
private val parser = TableParser(step)
private val reportingTopic = settings.processReportTopic
private val klaxon = Klaxon()
private val acceptedFormats = listOf(Formats.csv, Formats.xlsx, Formats.tsv, Formats.xls)
......@@ -55,8 +56,12 @@ class KafkaTopology(private val settings: SettingsLoader) {
val parsedTables = parseMessageStream[1]
.filter { _, value -> acceptedFormats.contains(value.first.format) }
.transformValues(HeaderExtractionTransformSupplier<Pair<Message, Report>>())
.flatMapValues { key, value -> parser.parseTable(key, value.first.first, value.second) }
.mapValues { value -> value.first }
.transformValues(HeaderExtractionTransformSupplier<Message>())
.mapValues { _, value ->
Triple(value.first, value.second, reader.open(value.first.path))
}
.flatMapValues { key, value -> parser.parseTable(key, value.first, value.second, value.third) }
.map { _, value -> KeyValue(value.key, value) }
parsedTables
......
/*
* Table Data Import Service
* Copyright (C) 2020-2021 Memoriav
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package org.memobase
import ch.memobase.settings.SftpSettings
import ch.memobase.sftp.SftpClient
import java.io.File
import java.io.InputStream
/**
 * Small adapter that opens remote files as input streams through an [SftpClient].
 *
 * One client is created from the supplied [SftpSettings] when the reader is constructed
 * and is reused for every [open] call.
 */
class SftpReader(settings: SftpSettings) {
    private val client = SftpClient(settings)

    /**
     * Opens the file at [path] via the underlying client.
     *
     * This class never closes the returned stream itself; that is left to the code
     * that consumes it.
     *
     * @param path location of the file, wrapped in a [File] before being handed to the client.
     * @return the stream returned by the client for that file.
     */
    fun open(path: String): InputStream = client.open(File(path))
}
\ No newline at end of file
......@@ -21,12 +21,10 @@ import ch.memobase.exceptions.InvalidInputException
import ch.memobase.reporting.Report
import ch.memobase.reporting.ReportStatus
import ch.memobase.settings.HeaderMetadata
import ch.memobase.settings.SettingsLoader
import ch.memobase.sftp.SftpClient
import com.beust.klaxon.json
import com.github.doyaaaaaken.kotlincsv.dsl.csvReader
import com.github.doyaaaaaken.kotlincsv.util.CSVFieldNumDifferentException
import java.io.File
import java.io.InputStream
import org.apache.poi.ss.usermodel.WorkbookFactory
import org.memobase.UtilityFunctions.retrieveCellValue
import org.memobase.UtilityFunctions.retrieveCells
......@@ -36,15 +34,19 @@ import org.memobase.models.Message
import org.memobase.models.ReportMessages
import org.memobase.models.ResultMessage
class TableParser(private val step: String, settings: SettingsLoader) {
private val sftpClient: SftpClient = SftpClient(settings.sftpSettings)
class TableParser(private val step: String) {
private val invalidPropertyNameCharacters = listOf('.', ':', '/', '+')
fun parseTable(key: String, inputMessage: Message, metadata: HeaderMetadata): List<ResultMessage> {
fun parseTable(
key: String,
inputMessage: Message,
metadata: HeaderMetadata,
inputStream: InputStream
): List<ResultMessage> {
return try {
when (inputMessage.format) {
Formats.xls, Formats.xlsx -> excelMapper(key, inputMessage, metadata)
Formats.csv, Formats.tsv -> csvMapper(key, inputMessage, metadata)
Formats.xls, Formats.xlsx -> excelMapper(key, metadata, inputStream)
Formats.csv, Formats.tsv -> csvMapper(key, inputMessage, metadata, inputStream)
else -> throw InvalidInputException("Cannot parse the table with format ${inputMessage.format}.")
}
} catch (ex: CSVFieldNumDifferentException) {
......@@ -100,10 +102,14 @@ class TableParser(private val step: String, settings: SettingsLoader) {
}
}
private fun csvMapper(key: String, value: Message, metadata: HeaderMetadata): List<ResultMessage> {
private fun csvMapper(
key: String,
value: Message,
metadata: HeaderMetadata,
inputStream: InputStream
): List<ResultMessage> {
val resultMessages = mutableListOf<ResultMessage>()
val identifierSet = mutableSetOf<String>()
val inputStream = sftpClient.open(File(value.path))
val reader =
csvReader {
......@@ -186,96 +192,100 @@ class TableParser(private val step: String, settings: SettingsLoader) {
return resultMessages
}
private fun excelMapper(key: String, value: Message, metadata: HeaderMetadata): List<ResultMessage> {
val inputStream = sftpClient.open(File(value.path))
val workbook = WorkbookFactory.create(inputStream)
// Only the XLS stream closes the input stream; the XLSX stream does not.
inputStream.close()
private fun excelMapper(
key: String,
metadata: HeaderMetadata,
inputStream: InputStream
): List<ResultMessage> {
val identifierSet = mutableSetOf<String>()
val propertiesList = mutableListOf<String>()
// sheet index is 0-based. This ensures that users can access sheet 1 with index 1!
val sheet = workbook.getSheetAt(metadata.tableSheetIndex - 1)
var count = 0
return sheet.filterEmptyRows().map { row ->
count += 1
if (count <= metadata.tableHeaderCount) {
if (count == metadata.tableHeaderIndex) {
propertiesList.addAll(row.map { cell ->
if (retrieveCellValue(cell).isNotEmpty()) {
if (retrieveCellValue(cell).any { char ->
invalidPropertyNameCharacters.contains(
char
)
}) {
throw InvalidInputException(
"The property in cell ${cell.address} contains one or more invalid characters: $invalidPropertyNameCharacters."
)
} else {
retrieveCellValue(cell)
}
} else {
throw InvalidInputException(
"The header index is missing a value in cell ${cell.address}"
)
}
}.map { it.trim() })
}
null
} else {
val rowIdentifier: String = try {
row.getCell(metadata.tableIdentifierIndex - 1).let { cell ->
if (cell != null) {
when (val cellValue = retrieveCellValue(cell)) {
"" -> {
inputStream.use {
WorkbookFactory.create(it).use { workbook ->
// sheet index is 0-based. This ensures that users can access sheet 1 with index 1!
val sheet = workbook.getSheetAt(metadata.tableSheetIndex - 1)
var count = 0
return sheet.filterEmptyRows().map { row ->
count += 1
if (count <= metadata.tableHeaderCount) {
if (count == metadata.tableHeaderIndex) {
propertiesList.addAll(row.map { cell ->
if (retrieveCellValue(cell).isNotEmpty()) {
if (retrieveCellValue(cell).any { char ->
invalidPropertyNameCharacters.contains(
char
)
}) {
throw InvalidInputException(
"The property in cell ${cell.address} contains one or more invalid characters: $invalidPropertyNameCharacters."
)
} else {
retrieveCellValue(cell)
}
} else {
throw InvalidInputException(
"The row ${row.rowNum} has an empty identifier in column ${metadata.tableIdentifierIndex}."
"The header index is missing a value in cell ${cell.address}"
)
}
in identifierSet -> {
}.map { it.trim() })
}
null
} else {
val rowIdentifier: String = try {
row.getCell(metadata.tableIdentifierIndex - 1).let { cell ->
if (cell != null) {
when (val cellValue = retrieveCellValue(cell)) {
"" -> {
throw InvalidInputException(
"The row ${row.rowNum} has an empty identifier in column ${metadata.tableIdentifierIndex}."
)
}
in identifierSet -> {
throw InvalidInputException(
"The row ${row.rowNum} contains a duplicated identifier in column ${metadata.tableIdentifierIndex} with another row."
)
}
else -> {
identifierSet.add(cellValue)
cellValue
}
}
} else {
throw InvalidInputException(
"The row ${row.rowNum} contains a duplicated identifier in column ${metadata.tableIdentifierIndex} with another row."
"No cell found in row ${row.rowNum} for column ${metadata.tableIdentifierIndex}."
)
}
else -> {
identifierSet.add(cellValue)
cellValue
}
}
} else {
throw InvalidInputException(
"No cell found in row ${row.rowNum} for column ${metadata.tableIdentifierIndex}."
} catch (ex: InvalidInputException) {
return@map ResultMessage(
key, null, Report(
key,
ReportStatus.fatal,
ReportMessages.fatalInput(ex.localizedMessage),
step
)
)
}
}
} catch (ex: InvalidInputException) {
return@map ResultMessage(
key, null, Report(
key,
ReportStatus.fatal,
ReportMessages.fatalInput(ex.localizedMessage),
step
)
)
}
val jsonObject = json {
obj(
zip(
propertiesList,
retrieveCells(row, propertiesList.size - 1)
val jsonObject = json {
obj(
zip(
propertiesList,
retrieveCells(row, propertiesList.size - 1)
)
)
}
ResultMessage(
rowIdentifier, jsonObject, Report(
rowIdentifier,
ReportStatus.success,
ReportMessages.success(),
step
)
)
)
}
ResultMessage(
rowIdentifier, jsonObject, Report(
rowIdentifier,
ReportStatus.success,
ReportMessages.success(),
step
)
)
}
// Empty rows create a null result. These are removed.
}.filterNotNull()
}
// Empty rows create a null result. These are removed.
}.filterNotNull()
}
}
}
......@@ -63,7 +63,7 @@ object UtilityFunctions {
cell.localDateTimeCellValue.toLocalTime().toString()
}
CellType.STRING -> cell.stringCellValue
CellType.FORMULA -> ""
CellType.FORMULA -> cell.stringCellValue
CellType.ERROR -> ""
else -> ""
}
......
/*
* Table Data Import Service
* Copyright (C) 2020-2021 Memoriav
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package org.memobase
import ch.memobase.settings.HeaderMetadata
import java.io.File
import java.io.FileInputStream
import org.assertj.core.api.Assertions.assertThat
import org.junit.jupiter.api.Test
import org.junit.jupiter.api.TestInstance
import org.memobase.models.Message
/**
 * Tests for the Excel branch of [TableParser.parseTable], driven by fixture files
 * located under `src/test/resources/excel`.
 */
@TestInstance(TestInstance.Lifecycle.PER_CLASS)
class TestExcelParser {
    private val parser = TableParser("test")

    // Positional fixture. NOTE(review): the four trailing 1s are presumably the sheet index,
    // header count, header index and identifier index that the Excel mapper reads from the
    // metadata (all 1-based there) - confirm the order against HeaderMetadata's constructor.
    private val header = HeaderMetadata(
        "",
        "",
        "",
        true,
        "",
        "",
        1,
        1,
        1,
        1
    )

    /**
     * Reads `src/test/resources/excel/<filename>.json` and returns its lines concatenated
     * without any line separators (an empty file yields an empty string).
     *
     * `useLines` closes the underlying file once the lines are consumed, so the helper
     * does not leak a file handle per call.
     */
    private fun reader(filename: String): String =
        File("src/test/resources/excel/$filename.json").useLines { lines -> lines.joinToString("") }

    @Test
    fun `test excel file with formulas`() {
        // The stream is handed to the parser as-is; no separate close is done in this test.
        val result = parser.parseTable(
            "key",
            Message("XLSX", ""),
            header,
            FileInputStream(File("src/test/resources/excel/test1.xlsx"))
        )

        // Only the first parsed row is compared against the expected JSON fixture.
        val output = result[0].value?.toJsonString()
        assertThat(output)
            .isEqualTo(reader("test1.output"))
    }
}
\ No newline at end of file
{"ID":"01","Old_ID":"memoriav-01","Signatur":"a","Typ":"Film","Haupttitel":"Titel Film 01","Serientitel":"Titel Film","Sendungstitel":"Titel Film vom 01-02-1931","Beschreibung":"Beschreibung <br> und vieles mehr","Genre":"Cinegiornale","Schlagworte":"Hund","Sprache_inhalt":"Deutsch","Erw_personen":"Henri, Guisan","Erw_institution":"Swisscom","Bemerkung":"Bemerkung <br> weiteres","Abdeckung_ort":"Irgendwo in Bern","Abdeckung_Zeit":"1990","Dauer":"00:00:01","Creation_date":"01-01-1931","Publishing_date":"01-02-1931","Urheber_person":"Ruth, Dreifuss","Urheber_institution":"PTT","Mitwirkende_person":"Kurt, Aeschbacher (Inteviewer)","Mitwirkende_institution":"SRF","Mitwirkende_agent":"DJ Bobo","Verlag_institution":"Haupt","Verlag_agent":"Fantasie Verlag 123","Produzent_peson":"Vorname, Name","Produzent_agent":"Fantasie Production","Entstehung":"Entstehungsumstände <br> und weiteres","Quelle":"Quelle y<br> und weiteres","Related_material":"Verwandtes Material <br> und weiteres","Sprache_untertitel":"Deutsch","Sprache_metadaten":"de","Rechtinhaber":"Rechteinhaber <br> und weiteres","rightsStatementName_digital":"In Copyright - Educational Use Permitted (InC-EDU)","rightsStatementURL_digital":"http://rightsstatements.org/vocab/InC-EDU/1.0/","Format":"35mm Zellulosenitrat","Farbe":"color","Tech_bemerkung":"Tech Bemerkung <br> weiteres","Zugang_physical":"onsite","Zugang_digital":"public","Backlink":"https://www.recherche.bar.admin.ch/recherche/#/de/suche/archivplan/21690329","Streaming":"https://media.zem.ch/01WS/1975/SFW_1648.mp4","Sponsored":"true"}
\ No newline at end of file
"ok","ok","ok","Umgang definieren. Zusammenzug mit Tetel oder in Bemerkung.","ok","Datum neu anordnen? JahrStart; JahrEnd; Freitext bsp: 1920 1930 ca.","Ausschliesslich Autor(en)","Weitere Beteiligte mit Name Vorname (ROLLE); …","Trenner eher speziell aber für Memobase ok","Trenner und Auflistung eher speziell aber für Memobase ok","Trenner eher speziell aber für Memobase ok",,"Memobase+ mit Freigabe?","-","ok","ok, ergänzt","ok","ok","Memobase verwendet hh:mm:ss. Ist aber ok so. Zeile 4 und 5 haben einen identischen Wert. Zufall? Korrigiert, danke!","neu: vorhandene Elemente"
"ID Original ID",,"Inhalt Haupttitel","Inhalt Beschreibung Bemerkung","Inhalt Beschreibung","Kontext Erstellung","Kontext AutorIn","AutorIn","Inhalt Genre","Inhalt Schlagworte","Kontext Aufnahmeort","Kontext Beschreibung Bemerkung","System Information Streaming",,"System Information Dokumenttyp","Technische Informationen Trägerformat (des Originals)","Technische Informationen Tonaufnahmeverfahren ","Technische Informationen Film Farbe","Technische Informationen Dauer","Technische Informationen Film Bemerkung"
"Exemplar-AVGRNr","Permalink","Titel-Title","Titel-ZusatzTitel","Titel-Beschreibung","Titel-ProduktionsjahrdesOriginals","Titel-FilmPersonen","Titel-Funktionen","Titel-Genre","Titel-Beschreibung","Titel-Drehort","Titel-Weiteres","Titel-Stream-Url","Titel-Benutzerzugang","Medium-Materialbezeichnung","Medium-MedienFormat","Medium-Ton","Medium-Farbe","Medium-Dauer","Medium-Bandlaenge"
"AVGR13716","https://www.gr.ch/Exemplare/13716","Wintersport in Arosa",,"Pferderennen am Obersee bei strahlendem Sonnenschein, viel Publikum, Gedränge vor Wettbüro, Reiter in Armeeuniform, Fotografen, Skijöring – Eisfest mit kostümierten Teilnehmer/innen vor Hotel Altein bei Nacht – Pferderennen am Obersee – Eiskunstlauf – Pferderennen, diesmal winterlicher – Schanzenspringen im Skigelände und viel Volk um die Alpgebäude Carmenna – Skifahrer im Aufstieg, Winterwanderer und nochmals Sprünge auf der Schneeschanze, Gruppe Skifahrer in wilder Schussfahrt, Wartende um die Hütten – Eishockey-Match – Impressionen von einem Abfahrtsrennen und Rundsicht über Arosa und Umgebung","1920, 1920-1929, genaues Datum nicht eruierbar","Brandt, Carl","Autor/in","Dokumentarfilm; Amateurfilm","Pferdesport; Ski alpin; Skispringen; Eishochey; Tourismus","Arosa","Eisfest: teilweise identische Aufnahmen in AVGR12097 „Ankunft David Zogg“ ; Schanzenspringen auf Carmenna: teilweise identische Aufnahmen in AVGR12115 „Touristen auf dem Tschuggen“","https://s3-eu-west-1.amazonaws.com/streaming.av-portal.gr.ch/13716/AVGR13716.mov","Intranet","Film","35-mm-Film, Negativ und Positiv, Nitrat","stumm","s/w getönt",0:17:02,"Vorhandene Elemente: AVGR9942: Negativ, Nitrat (CS, Z 986-172.8); AVGR9943: Positiv Nitrat (CS, Z 986-172.7); AVGR12098: Interpositiv / Marron 2366, Kopie 2016 (KBG); AVGR13715: Internegativ 2234, Kopie 2016 (KBG); AVGR13716: Positivkopie Farbe 2383, Kopie 2016, eingefärbte Sequenzen (KBG)"
\ No newline at end of file
"ok" ,"ok" ,"ok" ,"Umgang definieren. Zusammenzug mit Tetel oder in Bemerkung.","ok" ,"Datum neu anordnen? JahrStart; JahrEnd; Freitext bsp: 1920 1930 ca.","Ausschliesslich Autor(en)","Weitere Beteiligte mit Name Vorname (ROLLE); …","Trenner eher speziell aber für Memobase ok","Trenner und Auflistung eher speziell aber für Memobase ok" ,"Trenner eher speziell aber für Memobase ok", ,"Memobase+ mit Freigabe?" ,"-" ,"ok" ,"ok, ergänzt" ,"ok" ,"ok" ,"Memobase verwendet hh:mm:ss. Ist aber ok so. Zeile 4 und 5 haben einen identischen Wert. Zufall? Korrigiert, danke!","neu: vorhandene Elemente"
"ID Original ID" , ,"Inhalt Haupttitel" ,"Inhalt Beschreibung Bemerkung" ,"Inhalt Beschreibung" ,"Kontext Erstellung" ,"Kontext AutorIn" ,"AutorIn" ,"Inhalt Genre" ,"Inhalt Schlagworte" ,"Kontext Aufnahmeort" ,"Kontext Beschreibung Bemerkung" ,"System Information Streaming" , ,"System Information Dokumenttyp","Technische Informationen Trägerformat (des Originals)","Technische Informationen Tonaufnahmeverfahren ","Technische Informationen Film Farbe","Technische Informationen Dauer" ,"Technische Informationen Film Bemerkung"
"Exemplar-AVGRNr","Permalink" ,"Titel-Title" ,"Titel-ZusatzTitel" ,"Titel-Beschreibung" ,"Titel-ProduktionsjahrdesOriginals" ,"Titel-FilmPersonen" ,"Titel-Funktionen" ,"Titel-Genre" ,"Titel-Beschreibung" ,"Titel-Drehort" ,"Titel-Weiteres" ,"Titel-Stream-Url" ,"Titel-Benutzerzugang","Medium-Materialbezeichnung" ,"Medium-MedienFormat" ,"Medium-Ton" ,"Medium-Farbe" ,"Medium-Dauer" ,"Medium-Bandlaenge"
"AVGR13716" ,"https://www.gr.ch/Exemplare/13716","Wintersport in Arosa", ,"Pferderennen am Obersee bei strahlendem Sonnenschein, viel Publikum, Gedränge vor Wettbüro, Reiter in Armeeuniform, Fotografen, Skijöring – Eisfest mit kostümierten Teilnehmer/innen vor Hotel Altein bei Nacht – Pferderennen am Obersee – Eiskunstlauf – Pferderennen, diesmal winterlicher – Schanzenspringen im Skigelände und viel Volk um die Alpgebäude Carmenna – Skifahrer im Aufstieg, Winterwanderer und nochmals Sprünge auf der Schneeschanze, Gruppe Skifahrer in wilder Schussfahrt, Wartende um die Hütten – Eishockey-Match – Impressionen von einem Abfahrtsrennen und Rundsicht über Arosa und Umgebung","1920, 1920-1929, genaues Datum nicht eruierbar" ,"Brandt, Carl" ,"Autor/in" ,"Dokumentarfilm; Amateurfilm" ,"Pferdesport; Ski alpin; Skispringen; Eishochey; Tourismus","Arosa" ,"Eisfest: teilweise identische Aufnahmen in AVGR12097 „Ankunft David Zogg“ ; Schanzenspringen auf Carmenna: teilweise identische Aufnahmen in AVGR12115 „Touristen auf dem Tschuggen“","https://s3-eu-west-1.amazonaws.com/streaming.av-portal.gr.ch/13716/AVGR13716.mov","Intranet" ,"Film" ,"35-mm-Film, Negativ und Positiv, Nitrat" ,"stumm" ,"s/w getönt" ,0:17:02 ,"Vorhandene Elemente: AVGR9942: Negativ, Nitrat (CS, Z 986-172.8); AVGR9943: Positiv Nitrat (CS, Z 986-172.7); AVGR12098: Interpositiv / Marron 2366, Kopie 2016 (KBG); AVGR13715: Internegativ 2234, Kopie 2016 (KBG); AVGR13716: Positivkopie Farbe 2383, Kopie 2016, eingefärbte Sequenzen (KBG)"
\ No newline at end of file
,,,,,
,,,,,
id,name,text,stuff,stoff,steff
, , , , ,
, , , , ,
id ,name ,text ,stuff ,stoff ,steff
test-id-1,name-1,text-1,stuff-1,stoff-1,steff-1
test-id-2,name-2,text-2,stuff-2,stoff-2,steff-2
\ No newline at end of file
,,,,,
,,,,
id,name,text,stuff,stoff,steff
, , , , ,
, , , ,
id ,name ,text ,stuff ,stoff ,steff
test-id-1,name-1,stuff-1,stoff-1,steff-1
test-id-2,name-2,text-2,stuff-2,stoff-2,steff-2
\ No newline at end of file
test-id-2,name-2,text-2 ,stuff-2,stoff-2,steff-2
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment