Commit 27726a51 authored by Jonas Waeber's avatar Jonas Waeber
Browse files

Implement process reports

Refactor tests and move parser functionality out of KafkaTopology.kt.
parent 6e1f839c
Pipeline #12668 passed with stages
in 5 minutes and 25 seconds
......@@ -11,7 +11,7 @@ test:
tags:
- mbr
script:
- gradle --no-daemon --no-scan --no-build-cache test --fail-fast --tests "org.memobase.Tests"
- gradle --no-daemon --no-scan --no-build-cache test --fail-fast
.build-image:
......
......@@ -24,7 +24,8 @@ import org.apache.logging.log4j.LogManager
class App {
companion object {
private val log = LogManager.getLogger("TableDataTransformApp")
@JvmStatic fun main(args: Array<String>) {
@JvmStatic
fun main(args: Array<String>) {
try {
Service().run()
} catch (ex: Exception) {
......
This diff is collapsed.
......@@ -26,15 +26,15 @@ class Service(file: String = "app.yml") {
private val log = LogManager.getLogger("TableDataService")
val settings = SettingsLoader(
listOf(
"sheet",
"header.count",
"header.line",
"identifier"
),
file,
useStreamsConfig = true,
readSftpSettings = true
listOf(
"sheet",
"header.count",
"header.line",
"identifier"
),
file,
useStreamsConfig = true,
readSftpSettings = true
)
val topology = KafkaTopology(settings).build()
......
/*
* Table Data Import Service
* Copyright (C) 2020 Memoriav
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package org.memobase
import com.beust.klaxon.json
import com.github.doyaaaaaken.kotlincsv.dsl.csvReader
import java.io.File
import org.apache.poi.ss.usermodel.CellType
import org.apache.poi.ss.usermodel.WorkbookFactory
import org.memobase.UtilityFunctions.retrieveCellValue
import org.memobase.UtilityFunctions.retrieveCells
import org.memobase.UtilityFunctions.zip
import org.memobase.models.ErrorResult
import org.memobase.models.Formats
import org.memobase.models.Message
import org.memobase.models.ParserResult
import org.memobase.models.ProcessReport
import org.memobase.models.Report
import org.memobase.models.ReportMessages
import org.memobase.models.ReportStatus
import org.memobase.models.ResultMessage
import org.memobase.settings.SettingsLoader
import org.memobase.sftp.SftpClient
/**
 * Turns a table file (CSV, TSV, XLS or XLSX) opened through the SFTP client into one
 * [ResultMessage] per data row.
 *
 * The settings `sheet`, `header.line` and `identifier` are 1-based; `header.count` is the
 * number of leading rows that are treated as header rows and produce no messages.
 */
class TableParser(settings: SettingsLoader) {
    private val sftpClient: SftpClient = SftpClient(settings.sftpSettings)
    private val sheetIndex = settings.appSettings.getProperty("sheet").toInt()
    private val headerCount = settings.appSettings.getProperty("header.count").toInt()
    private val propertyNamesIndex = settings.appSettings.getProperty("header.line").toInt()
    private val identifierIndex = settings.appSettings.getProperty("identifier").toInt()
    private val invalidPropertyNameCharacters = listOf('.', ':', '/', '+')

    /**
     * Parses the table referenced by [inputMessage] with the mapper matching its format.
     *
     * @param key key used for the failure report when the table as a whole cannot be parsed.
     * @param inputMessage format and path of the table file.
     * @return the row messages together with a [ProcessReport] counting total, successful and
     * failed rows. If an [InvalidInputException] or [IllegalArgumentException] escapes the
     * mapper, a single failure message with a failure process report is returned instead.
     */
    fun parseTable(key: String, inputMessage: Message): ParserResult {
        return try {
            val result: List<ResultMessage> = when (inputMessage.format) {
                Formats.xls, Formats.xlsx -> excelMapper(key, inputMessage)
                Formats.csv, Formats.tsv -> csvMapper(key, inputMessage)
                else -> throw InvalidInputException(key, "Cannot parse the table with format ${inputMessage.format}.")
            }
            val failureCount = result.count { message -> message.report.status == ReportStatus.failure }
            // NOTE(review): the overall status is success even when individual rows failed —
            // confirm that this is intended.
            ParserResult(
                result,
                ProcessReport(
                    "table-data-transform",
                    ReportStatus.success,
                    result.size,
                    result.size - failureCount,
                    failureCount
                )
            )
        } catch (ex: InvalidInputException) {
            ParserResult(
                ex.key, ErrorResult.get(), Report(ex.key, ReportStatus.failure, ex.message ?: "Unknown issue."),
                ProcessReport(ReportStatus.failure, 1)
            )
        } catch (ex: IllegalArgumentException) { // Sheet index does not exist
            // The exception may carry no message; never hand null to the non-null report text.
            ParserResult(
                key, ErrorResult.get(), Report(key, ReportStatus.failure, ex.localizedMessage ?: "Unknown issue."),
                ProcessReport(ReportStatus.failure, 1)
            )
        }
    }

    /**
     * Builds the failure message for a single row from the exception describing the problem.
     */
    private fun failureMessage(ex: InvalidInputException): ResultMessage {
        return ResultMessage(
            ex.key,
            ErrorResult.get(),
            Report(
                ex.key,
                ReportStatus.failure,
                ReportMessages.reportFailure(ex.localizedMessage)
            )
        )
    }

    /**
     * Reads a CSV (comma separated) or TSV (tab separated) file and maps every data row.
     *
     * Rows with a missing, empty or duplicated identifier yield a failure message; an invalid
     * header row aborts the whole table with an [InvalidInputException].
     */
    private fun csvMapper(key: String, value: Message): List<ResultMessage> {
        val resultMessages = mutableListOf<ResultMessage>()
        val knownIdentifiers = mutableSetOf<String>()
        sftpClient.open(File(value.path)).use { inputStream ->
            val rows =
                csvReader {
                    this.quoteChar = '"'
                    this.delimiter = if (value.format == Formats.csv) ',' else '\t'
                    this.charset = Charsets.UTF_8.displayName()
                    // this.skipEmptyLine = true
                }.readAll(inputStream)
            var headerProperties = emptyList<String>()
            var count = 0
            for (line in rows) {
                count += 1
                if (count <= headerCount) {
                    if (count == propertyNamesIndex) {
                        // Store the trimmed names, exactly as validated (and as the Excel mapper does).
                        headerProperties = line.mapIndexed { index, property ->
                            val trimmedProperty = property.trim()
                            if (trimmedProperty.isEmpty()) {
                                throw InvalidInputException(
                                    key,
                                    "Missing a property name on row $count in column ${index + 1}!"
                                )
                            }
                            if (trimmedProperty.any { char -> char in invalidPropertyNameCharacters }) {
                                throw InvalidInputException(
                                    key,
                                    "Invalid property name $trimmedProperty on row $count in column ${index + 1}! " +
                                        "You may not use any of the following characters: $invalidPropertyNameCharacters."
                                )
                            }
                            trimmedProperty
                        }
                    }
                    continue
                }
                // the -1 ensures, that users can start columns beginning at 1!
                val identifier: String = try {
                    // A row without the identifier column is reported as a failed row instead of
                    // aborting with an IndexOutOfBoundsException.
                    val candidate = line.getOrNull(identifierIndex - 1)
                        ?: throw InvalidInputException(
                            "$count.$identifierIndex",
                            "No cell found in row $count for column $identifierIndex."
                        )
                    when (candidate) {
                        "" -> {
                            throw InvalidInputException(
                                "$count.$identifierIndex",
                                "The row $count has an empty identifier in column $identifierIndex."
                            )
                        }
                        in knownIdentifiers -> {
                            throw InvalidInputException(
                                "$count.$identifierIndex",
                                "The row $count contains a duplicated identifier in column $identifierIndex with another row."
                            )
                        }
                        else -> {
                            knownIdentifiers.add(candidate)
                            candidate
                        }
                    }
                } catch (ex: InvalidInputException) {
                    resultMessages.add(failureMessage(ex))
                    continue
                }
                val keyValueMap = json {
                    obj(
                        zip(headerProperties, line)
                    )
                }
                val report = Report(
                    identifier,
                    ReportStatus.success,
                    ReportMessages.reportSuccess(identifier, count)
                )
                resultMessages.add(ResultMessage(identifier, keyValueMap, report))
            }
        }
        return resultMessages
    }

    /**
     * Reads the configured sheet of an XLS/XLSX workbook and maps every data row.
     *
     * Rows without any string, boolean or numeric cell are ignored and do not count as rows.
     * Rows with a missing, empty or duplicated identifier yield a failure message; an invalid
     * header row aborts the whole table with an [InvalidInputException].
     */
    private fun excelMapper(key: String, value: Message): List<ResultMessage> {
        return sftpClient.open(File(value.path)).use { inputStream ->
            WorkbookFactory.create(inputStream).use { workbook ->
                val identifierSet = mutableSetOf<String>()
                val propertiesList = mutableListOf<String>()
                // sheet index is 0-based. This ensures that users can access sheet 1 with index 1!
                val sheet = workbook.getSheetAt(sheetIndex - 1)
                var count = 0
                sheet.filter { row ->
                    row.any { cell ->
                        // filter all rows with no string, boolean or numeric cell
                        when (cell.cellType) {
                            CellType.NUMERIC, CellType.STRING, CellType.BOOLEAN -> true
                            else -> false
                        }
                    }
                }.map { row ->
                    count += 1
                    if (count <= headerCount) {
                        if (count == propertyNamesIndex) {
                            propertiesList.addAll(row.map { cell ->
                                // Trim before validating so a whitespace-only cell is rejected
                                // rather than becoming an empty property name.
                                val propertyName = retrieveCellValue(cell).trim()
                                if (propertyName.isEmpty()) {
                                    throw InvalidInputException(
                                        key,
                                        "The header index is missing a value in cell ${cell.address}"
                                    )
                                }
                                if (propertyName.any { char -> char in invalidPropertyNameCharacters }) {
                                    throw InvalidInputException(
                                        key,
                                        "The property in cell ${cell.address} contains one or more invalid characters: $invalidPropertyNameCharacters."
                                    )
                                }
                                propertyName
                            })
                        }
                        // Header rows create a null result. These are removed below.
                        null
                    } else {
                        val rowIdentifier: String = try {
                            val identifierCell = row.getCell(identifierIndex - 1)
                                ?: throw InvalidInputException(
                                    "${row.rowNum}.$identifierIndex",
                                    "No cell found in row ${row.rowNum} for column $identifierIndex."
                                )
                            when (val cellValue = retrieveCellValue(identifierCell)) {
                                "" -> {
                                    throw InvalidInputException(
                                        "CellAddress: $count:$identifierIndex",
                                        "The row ${row.rowNum} has an empty identifier in column $identifierIndex."
                                    )
                                }
                                in identifierSet -> {
                                    throw InvalidInputException(
                                        "CellAddress: $count:$identifierIndex",
                                        "The row ${row.rowNum} contains a duplicated identifier in column $identifierIndex with another row."
                                    )
                                }
                                else -> {
                                    identifierSet.add(cellValue)
                                    cellValue
                                }
                            }
                        } catch (ex: InvalidInputException) {
                            return@map failureMessage(ex)
                        }
                        val jsonObject = json {
                            obj(
                                zip(
                                    propertiesList,
                                    // retrieveCells takes the last column index (inclusive).
                                    retrieveCells(row, propertiesList.size - 1)
                                )
                            )
                        }
                        ResultMessage(
                            rowIdentifier, jsonObject, Report(
                                rowIdentifier,
                                ReportStatus.success,
                                ReportMessages.reportSuccess(rowIdentifier, count)
                            )
                        )
                    }
                }.filterNotNull()
            }
        }
    }
}
/*
* Table Data Import Service
* Copyright (C) 2020 Memoriav
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package org.memobase
import org.apache.poi.ss.usermodel.Cell
import org.apache.poi.ss.usermodel.CellType
import org.apache.poi.ss.usermodel.Row
/**
 * Helpers for reading spreadsheet cells as strings and for pairing header names with row values.
 */
object UtilityFunctions {
    /**
     * Retrieves cells from a row of excel. Restricts the size to the actually
     * used part of sheet as otherwise the row is many time larger with many empty cells.
     *
     * @param row: The row from which to retrieve cells.
     * @param size: The index of the last column to read (inclusive, 0-based). Columns
     * `0..size` are read, so `size + 1` values are returned; a negative value yields an empty list.
     *
     * @return A list of the cell values as strings.
     */
    fun retrieveCells(row: Row, size: Int): List<String> {
        return (0..size).map { columnIndex -> retrieveCellValue(row.getCell(columnIndex)) }
    }

    /**
     * Checks a cell and returns the content as string. If no valid value is found an empty
     * string is returned.
     *
     * In case of a numeric value, the number is either interpreted as a time (if the number is below one) or
     * as an integer (decimals are cut off). Numbers that cannot be read as a date/time value
     * are rendered as an integer as well instead of failing.
     *
     * @param cell: A potential cell.
     * @return Content of the cell as a string.
     */
    fun retrieveCellValue(cell: Cell?): String {
        if (cell == null) return ""
        return when (cell.cellType) {
            CellType.BOOLEAN -> cell.booleanCellValue.toString()
            CellType.NUMERIC -> {
                val number = cell.numericCellValue
                if (number >= 1) {
                    number.toLong().toString()
                } else {
                    // The date conversion can yield null (e.g. for negative numbers);
                    // fall back to the integer rendering rather than dereferencing null.
                    cell.localDateTimeCellValue?.toLocalTime()?.toString()
                        ?: number.toLong().toString()
                }
            }
            CellType.STRING -> cell.stringCellValue
            // blank, formula, error and unknown cells carry no usable value
            else -> ""
        }
    }

    /**
     * Creates pairs from header + line values in the same column. Columns whose value is
     * empty, or that are missing because the line is shorter than the header, are skipped.
     * Values are trimmed.
     *
     * @param header: A list of all properties
     * @param line: The content of the current line.
     * @return The (property, value) pairs of all columns that hold a value.
     */
    fun zip(header: List<String>, line: List<String>): List<Pair<String, String>> {
        return header.mapIndexedNotNull { index, property ->
            val cellValue = line.getOrNull(index)
            if (cellValue.isNullOrEmpty()) null else Pair(property, cellValue.trim())
        }
    }
}
/*
* Table Data Import Service
* Copyright (C) 2020 Memoriav
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package org.memobase.models
import com.beust.klaxon.json
/**
 * Provides the JSON payload attached to messages that report a failure.
 */
object ErrorResult {
    /**
     * Creates a new JSON object with the single entry `message` set to [Formats.error].
     */
    fun get() = json { obj("message" to Formats.error) }
}
/*
* Table Data Import Service
* Copyright (C) 2020 Memoriav
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package org.memobase.models
/**
 * String identifiers for the table formats carried in a message's `format` field,
 * plus the markers used for invalid input and error payloads.
 */
object Formats {
    /** Comma separated values. */
    const val csv: String = "CSV"

    /** Tab separated values. */
    const val tsv: String = "TSV"

    /** Excel workbook (xlsx). */
    const val xlsx: String = "XLSX"

    /** Excel workbook (xls). */
    const val xls: String = "XLS"

    /** OpenDocument spreadsheet. */
    const val ods: String = "ODS"

    /** Marker for an invalid format. */
    const val invalid: String = "INVALID"

    /** Marker used as the message of error payloads. */
    const val error: String = "ERROR"
}
/*
* Table Data Import Service
* Copyright (C) 2020 Memoriav
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package org.memobase.models
import com.beust.klaxon.Klaxon
/**
 * Input message describing a table file to parse.
 *
 * @property format format identifier of the file (compared against the [Formats] constants).
 * @property path path of the file to open.
 */
data class Message(
    val format: String,
    val path: String
) {
    /** Serializes this message to a JSON string with Klaxon. */
    fun toJson(): String = Klaxon().toJsonString(this)
}
/*
* Table Data Import Service
* Copyright (C) 2020 Memoriav
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package org.memobase.models
import com.beust.klaxon.JsonObject
/**
 * Outcome of parsing one table: the messages created for its rows and the summarizing report.
 *
 * @property messages the result messages produced by the parser.
 * @property processReport summary report for the whole parse run.
 */
data class ParserResult(
    val messages: List<ResultMessage>,
    val processReport: ProcessReport
) {
    /**
     * Convenience constructor for a result that consists of exactly one message,
     * built from [key], [jsonObject] and [report].
     */
    constructor(
        key: String,
        jsonObject: JsonObject,
        report: Report,
        processReport: ProcessReport
    ) : this(
        messages = listOf(ResultMessage(key, jsonObject, report)),
        processReport = processReport
    )
}
/*
* Table Data Import Service
* Copyright (C) 2020 Memoriav
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package org.memobase.models
import com.beust.klaxon.Klaxon
/**
 * Summary of one processing run.
 *
 * @property id identifier of the reporting step; defaults to `table-data-transform`.
 * @property status status value of the run.
 * @property total number of processed items.
 * @property successes number of successfully processed items.
 * @property failures number of failed items.
 */
data class ProcessReport(
    val id: String = "table-data-transform",
    val status: String,
    val total: Int,
    val successes: Int,
    val failures: Int
) {
    /**
     * Creates a report in which all [total] items share the given [status]: they are counted
     * as successes for [ReportStatus.success], as failures for [ReportStatus.failure], and
     * as neither for any other status.
     */
    constructor(status: String, total: Int) : this(
        id = "table-data-transform",
        status = status,
        total = total,
        successes = if (status == ReportStatus.success) total else 0,
        failures = if (status == ReportStatus.failure) total else 0
    )

    /** Serializes this report to a JSON string with Klaxon. */
    fun toJson(): String = Klaxon().toJsonString(this)
}
/*
* sftp-reader
* Table Data Import Service
* Copyright (C) 2020 Memoriav
*
* This program is free software: you can redistribute it and/or modify
......@@ -16,25 +16,16 @@
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package org.memobase
package org.memobase.models
import com.beust.klaxon.Klaxon
data class Report(
val id: String,
val status: String,
val message: String