Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
memoriav
Memobase 2020
services
Import Process
Table Data Transform
Commits
27726a51
Commit
27726a51
authored
Aug 11, 2020
by
Jonas Waeber
Browse files
Implement process reports
Refactors tests & move parser functionality out of KafkaTopology.kt.
parent
6e1f839c
Pipeline
#12668
passed with stages
in 5 minutes and 25 seconds
Changes
35
Pipelines
2
Expand all
Hide whitespace changes
Inline
Side-by-side
.gitlab-ci.yml
View file @
27726a51
...
...
@@ -11,7 +11,7 @@ test:
tags
:
-
mbr
script
:
-
gradle --no-daemon --no-scan --no-build-cache test --fail-fast
--tests "org.memobase.Tests"
-
gradle --no-daemon --no-scan --no-build-cache test --fail-fast
.build-image
:
...
...
src/main/kotlin/App.kt
View file @
27726a51
...
...
@@ -24,7 +24,8 @@ import org.apache.logging.log4j.LogManager
class
App
{
companion
object
{
private
val
log
=
LogManager
.
getLogger
(
"TableDataTransformApp"
)
@JvmStatic
fun
main
(
args
:
Array
<
String
>)
{
@JvmStatic
fun
main
(
args
:
Array
<
String
>)
{
try
{
Service
().
run
()
}
catch
(
ex
:
Exception
)
{
...
...
src/main/kotlin/KafkaTopology.kt
View file @
27726a51
This diff is collapsed.
Click to expand it.
src/main/kotlin/Service.kt
View file @
27726a51
...
...
@@ -26,15 +26,15 @@ class Service(file: String = "app.yml") {
private
val
log
=
LogManager
.
getLogger
(
"TableDataService"
)
val
settings
=
SettingsLoader
(
listOf
(
"sheet"
,
"header.count"
,
"header.line"
,
"identifier"
),
file
,
useStreamsConfig
=
true
,
readSftpSettings
=
true
listOf
(
"sheet"
,
"header.count"
,
"header.line"
,
"identifier"
),
file
,
useStreamsConfig
=
true
,
readSftpSettings
=
true
)
val
topology
=
KafkaTopology
(
settings
).
build
()
...
...
src/main/kotlin/TableParser.kt
0 → 100644
View file @
27726a51
/*
* Table Data Import Service
* Copyright (C) 2020 Memoriav
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package
org.memobase
import
com.beust.klaxon.json
import
com.github.doyaaaaaken.kotlincsv.dsl.csvReader
import
java.io.File
import
org.apache.poi.ss.usermodel.CellType
import
org.apache.poi.ss.usermodel.WorkbookFactory
import
org.memobase.UtilityFunctions.retrieveCellValue
import
org.memobase.UtilityFunctions.retrieveCells
import
org.memobase.UtilityFunctions.zip
import
org.memobase.models.ErrorResult
import
org.memobase.models.Formats
import
org.memobase.models.Message
import
org.memobase.models.ParserResult
import
org.memobase.models.ProcessReport
import
org.memobase.models.Report
import
org.memobase.models.ReportMessages
import
org.memobase.models.ReportStatus
import
org.memobase.models.ResultMessage
import
org.memobase.settings.SettingsLoader
import
org.memobase.sftp.SftpClient
/**
 * Parses tabular files (CSV, TSV, XLS, XLSX) opened through the [SftpClient] into one
 * [ResultMessage] per data row.
 *
 * The positions read from the app settings (`sheet`, `header.line`, `identifier`) are
 * 1-based and are converted to 0-based indices where a library requires it.
 * The first `header.count` non-empty rows are treated as header rows; the row at
 * position `header.line` supplies the property names used as JSON keys.
 *
 * @param settings Loader providing the SFTP settings and the integer app settings
 * `sheet`, `header.count`, `header.line` and `identifier`.
 */
class TableParser(settings: SettingsLoader) {
    private val sftpClient: SftpClient = SftpClient(settings.sftpSettings)
    private val sheetIndex = settings.appSettings.getProperty("sheet").toInt()
    private val headerCount = settings.appSettings.getProperty("header.count").toInt()
    private val propertyNamesIndex = settings.appSettings.getProperty("header.line").toInt()
    private val identifierIndex = settings.appSettings.getProperty("identifier").toInt()
    private val invalidPropertyNameCharacters = listOf('.', ':', '/', '+')

    // A row only counts as non-empty if at least one cell has one of these types.
    private val contentCellTypes = setOf(CellType.NUMERIC, CellType.STRING, CellType.BOOLEAN)

    /**
     * Parses the table referenced by [inputMessage] and summarises the outcome.
     *
     * Row-level problems (missing/empty/duplicated identifier) become failure messages inside
     * the result list. File-level problems (unsupported format, invalid header, missing sheet)
     * yield a single failure message and a failed [ProcessReport].
     *
     * @param key Key of the incoming message; used as id for file-level failures.
     * @param inputMessage Format and path of the table to parse.
     * @return The per-row messages together with the process report.
     */
    fun parseTable(key: String, inputMessage: Message): ParserResult {
        return try {
            val result: List<ResultMessage> = when (inputMessage.format) {
                Formats.xls, Formats.xlsx -> excelMapper(key, inputMessage)
                Formats.csv, Formats.tsv -> csvMapper(key, inputMessage)
                else -> throw InvalidInputException(
                    key,
                    "Cannot parse the table with format ${inputMessage.format}."
                )
            }
            val failureCount = result.count { message -> message.report.status == ReportStatus.failure }
            // NOTE(review): the process status is reported as success even when individual
            // rows failed; only the counters reflect row failures. Confirm this is intended.
            ParserResult(
                result,
                ProcessReport(
                    "table-data-transform",
                    ReportStatus.success,
                    result.size,
                    result.size - failureCount,
                    failureCount
                )
            )
        } catch (ex: InvalidInputException) {
            fileFailure(ex.key, ex.message ?: "Unknown issue.")
        } catch (ex: IllegalArgumentException) {
            // Sheet index does not exist.
            // The exception message may be absent, so fall back to a fixed text instead of
            // passing null into the non-null report message.
            fileFailure(key, ex.localizedMessage ?: "Unknown issue.")
        }
    }

    /** Builds the result for a failure that affects the whole file. */
    private fun fileFailure(key: String, message: String): ParserResult =
        ParserResult(
            key,
            ErrorResult.get(),
            Report(key, ReportStatus.failure, message),
            ProcessReport(ReportStatus.failure, 1)
        )

    /** Builds the failure message for a single row from the exception describing the problem. */
    private fun failureMessage(ex: InvalidInputException): ResultMessage =
        ResultMessage(
            ex.key,
            ErrorResult.get(),
            Report(ex.key, ReportStatus.failure, ReportMessages.reportFailure(ex.localizedMessage))
        )

    /**
     * Reads a CSV or TSV file and maps every data row to a [ResultMessage].
     *
     * @param key Key of the incoming message; used as id when the header is invalid.
     * @param value Format and path of the file; the format selects the delimiter.
     * @throws InvalidInputException if a property name in the header row is empty or
     * contains a forbidden character.
     */
    private fun csvMapper(key: String, value: Message): List<ResultMessage> {
        val resultMessages = mutableListOf<ResultMessage>()
        val knownIdentifiers = mutableSetOf<String>()
        sftpClient.open(File(value.path)).use { inputStream ->
            val rows = csvReader {
                this.quoteChar = '"'
                this.delimiter = if (value.format == Formats.csv) ',' else '\t'
                this.charset = Charsets.UTF_8.displayName()
                // this.skipEmptyLine = true
            }.readAll(inputStream)
            var headerProperties = emptyList<String>()
            var count = 0
            for (line in rows) {
                count += 1
                if (count <= headerCount) {
                    if (count == propertyNamesIndex) {
                        headerProperties = readCsvHeader(key, line, count)
                    }
                    continue
                }
                val identifier: String = try {
                    csvIdentifier(line, count, knownIdentifiers)
                } catch (ex: InvalidInputException) {
                    resultMessages.add(failureMessage(ex))
                    continue
                }
                val keyValueMap = json { obj(zip(headerProperties, line)) }
                val report = Report(
                    identifier,
                    ReportStatus.success,
                    ReportMessages.reportSuccess(identifier, count)
                )
                resultMessages.add(ResultMessage(identifier, keyValueMap, report))
            }
        }
        return resultMessages
    }

    /**
     * Validates the property names of the CSV header row and returns them trimmed, so the
     * JSON keys match what was validated (the Excel path trims its property names as well).
     */
    private fun readCsvHeader(key: String, line: List<String>, count: Int): List<String> =
        line.mapIndexed { index, property ->
            val trimmedProperty = property.trim()
            if (trimmedProperty.isEmpty()) {
                throw InvalidInputException(
                    key,
                    "Missing a property name on row $count in column ${index + 1}!"
                )
            }
            if (trimmedProperty.any { char -> char in invalidPropertyNameCharacters }) {
                throw InvalidInputException(
                    key,
                    "Invalid property name $trimmedProperty on row $count in column ${index + 1}! " +
                        "You may not use any of the following characters: " +
                        invalidPropertyNameCharacters.joinToString(" ")
                )
            }
            trimmedProperty
        }

    /**
     * Returns the identifier of a CSV row and registers it in [knownIdentifiers].
     *
     * @throws InvalidInputException if the row has no identifier column, or the identifier
     * is empty or was already used by an earlier row.
     */
    private fun csvIdentifier(line: List<String>, count: Int, knownIdentifiers: MutableSet<String>): String {
        val cellKey = "$count.$identifierIndex"
        // The -1 ensures that users can start columns beginning at 1.
        // getOrNull turns a too-short row into a reported row failure instead of an
        // IndexOutOfBoundsException that no caller handles.
        val identifier = line.getOrNull(identifierIndex - 1)
            ?: throw InvalidInputException(
                cellKey,
                "No cell found in row $count for column $identifierIndex."
            )
        return when (identifier) {
            "" -> throw InvalidInputException(
                cellKey,
                "The row $count has an empty identifier in column $identifierIndex."
            )
            in knownIdentifiers -> throw InvalidInputException(
                cellKey,
                "The row $count contains a duplicated identifier in column $identifierIndex with another row."
            )
            else -> identifier.also { knownIdentifiers.add(it) }
        }
    }

    /**
     * Reads the configured sheet of an Excel workbook and maps every data row to a
     * [ResultMessage]. Rows without any numeric, string or boolean cell are ignored and
     * do not advance the row counter.
     *
     * @param key Key of the incoming message; used as id when the header is invalid.
     * @param value Format and path of the workbook.
     * @throws InvalidInputException if a property name in the header row is empty or
     * contains a forbidden character.
     */
    private fun excelMapper(key: String, value: Message): List<ResultMessage> {
        return sftpClient.open(File(value.path)).use { inputStream ->
            WorkbookFactory.create(inputStream).use { workbook ->
                val identifierSet = mutableSetOf<String>()
                val propertiesList = mutableListOf<String>()
                // Sheet index is 0-based. This ensures that users can access sheet 1 with index 1!
                val sheet = workbook.getSheetAt(sheetIndex - 1)
                var count = 0
                sheet.filter { row -> row.any { cell -> cell.cellType in contentCellTypes } }
                    .mapNotNull { row ->
                        count += 1
                        if (count <= headerCount) {
                            if (count == propertyNamesIndex) {
                                propertiesList.addAll(
                                    row.map { cell ->
                                        // Trim before validating so a whitespace-only cell is
                                        // rejected instead of becoming an empty property name.
                                        val property = retrieveCellValue(cell).trim()
                                        if (property.isEmpty()) {
                                            throw InvalidInputException(
                                                key,
                                                "The header index is missing a value in cell ${cell.address}"
                                            )
                                        }
                                        if (property.any { char -> char in invalidPropertyNameCharacters }) {
                                            throw InvalidInputException(
                                                key,
                                                "The property in cell ${cell.address} contains one or more invalid characters: $invalidPropertyNameCharacters."
                                            )
                                        }
                                        property
                                    }
                                )
                            }
                            // Header rows produce no message.
                            return@mapNotNull null
                        }
                        val rowIdentifier: String = try {
                            val cell = row.getCell(identifierIndex - 1)
                                ?: throw InvalidInputException(
                                    "${row.rowNum}.$identifierIndex",
                                    "No cell found in row ${row.rowNum} for column $identifierIndex."
                                )
                            when (val cellValue = retrieveCellValue(cell)) {
                                "" -> throw InvalidInputException(
                                    "CellAddress: $count:$identifierIndex",
                                    "The row ${row.rowNum} has an empty identifier in column $identifierIndex."
                                )
                                in identifierSet -> throw InvalidInputException(
                                    "CellAddress: $count:$identifierIndex",
                                    "The row ${row.rowNum} contains a duplicated identifier in column $identifierIndex with another row."
                                )
                                else -> cellValue.also { identifierSet.add(it) }
                            }
                        } catch (ex: InvalidInputException) {
                            return@mapNotNull failureMessage(ex)
                        }
                        val jsonObject = json {
                            obj(zip(propertiesList, retrieveCells(row, propertiesList.size - 1)))
                        }
                        ResultMessage(
                            rowIdentifier,
                            jsonObject,
                            Report(
                                rowIdentifier,
                                ReportStatus.success,
                                ReportMessages.reportSuccess(rowIdentifier, count)
                            )
                        )
                    }
            }
        }
    }
}
src/main/kotlin/UtilityFunctions.kt
0 → 100644
View file @
27726a51
/*
* Table Data Import Service
* Copyright (C) 2020 Memoriav
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package
org.memobase
import
org.apache.poi.ss.usermodel.Cell
import
org.apache.poi.ss.usermodel.CellType
import
org.apache.poi.ss.usermodel.Row
/**
 * Helper functions for reading Excel cells and pairing header names with row values.
 */
object UtilityFunctions {
    /**
     * Retrieves the cell values of an Excel row for the column indices `0..size`, both
     * inclusive, so `size + 1` values are returned. Restricting the range to the used part
     * of the sheet avoids reading the many empty trailing cells a row may have.
     *
     * @param row The row from which to retrieve cells.
     * @param size The last 0-based column index to read (inclusive). A negative value
     * yields an empty list.
     *
     * @return A list of the cell values as strings; missing cells become empty strings.
     */
    fun retrieveCells(row: Row, size: Int): List<String> =
        (0..size).map { index -> retrieveCellValue(row.getCell(index)) }

    /**
     * Checks a cell and returns the content as string. If no valid value is found an empty
     * string is returned (this includes blank, formula and error cells).
     *
     * In case of a numeric value, the number is either interpreted as a time (if the number
     * is below one) or as an integer (the fractional part is dropped).
     *
     * @param cell A potential cell.
     * @return Content of the cell as a string.
     */
    fun retrieveCellValue(cell: Cell?): String {
        if (cell == null) return ""
        return when (cell.cellType) {
            CellType.BOOLEAN -> cell.booleanCellValue.toString()
            CellType.STRING -> cell.stringCellValue
            CellType.NUMERIC -> {
                val number = cell.numericCellValue
                if (number >= 1) {
                    number.toLong().toString()
                } else {
                    // The date accessor returns null for values that are not valid Excel
                    // dates (negative numbers); fall back to the integer form instead of
                    // failing with a NullPointerException.
                    cell.localDateTimeCellValue?.toLocalTime()?.toString()
                        ?: number.toLong().toString()
                }
            }
            else -> ""
        }
    }

    /**
     * Creates pairs from header + line values in the same column. Columns whose value is
     * empty are skipped and values are trimmed. If the two lists differ in length, only
     * the columns present in both are paired.
     *
     * @param header A list of all properties.
     * @param line The content of the current line.
     * @return The (property, value) pairs of all columns with a non-empty value.
     */
    fun zip(header: List<String>, line: List<String>): List<Pair<String, String>> =
        // The stdlib zip stops at the shorter list, so a short line can no longer cause
        // an IndexOutOfBoundsException.
        header.zip(line)
            .filter { (_, cellValue) -> cellValue.isNotEmpty() }
            .map { (property, cellValue) -> property to cellValue.trim() }
}
src/main/kotlin/models/ErrorResult.kt
0 → 100644
View file @
27726a51
/*
* Table Data Import Service
* Copyright (C) 2020 Memoriav
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package
org.memobase.models
import
com.beust.klaxon.json
/**
 * Provides the JSON payload attached to messages that report a failure.
 */
object ErrorResult {
    /**
     * Creates a JSON object with the single entry `"message"` set to [Formats.error].
     */
    fun get() = json {
        obj("message" to Formats.error)
    }
}
src/main/kotlin/models/Formats.kt
0 → 100644
View file @
27726a51
/*
* Table Data Import Service
* Copyright (C) 2020 Memoriav
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package
org.memobase.models
/**
 * String constants naming the table formats carried in a message, plus the markers
 * used for invalid input and errors.
 */
object Formats {
    // Delimiter-separated text formats.
    const val csv = "CSV"
    const val tsv = "TSV"

    // Spreadsheet formats.
    const val xlsx = "XLSX"
    const val xls = "XLS"
    const val ods = "ODS"

    // Markers that do not name a table format.
    const val invalid = "INVALID"
    const val error = "ERROR"
}
src/main/kotlin/models/Message.kt
0 → 100644
View file @
27726a51
/*
* Table Data Import Service
* Copyright (C) 2020 Memoriav
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package
org.memobase.models
import
com.beust.klaxon.Klaxon
/**
 * Input message describing a table file.
 *
 * @property format Format of the file.
 * @property path Path of the file.
 */
data class Message(
    val format: String,
    val path: String
) {
    /** Serialises this message to a JSON string with Klaxon. */
    fun toJson(): String = Klaxon().toJsonString(this)
}
src/main/kotlin/models/ParserResult.kt
0 → 100644
View file @
27726a51
/*
* Table Data Import Service
* Copyright (C) 2020 Memoriav
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package
org.memobase.models
import
com.beust.klaxon.JsonObject
/**
 * Outcome of parsing one table.
 *
 * @property messages The result messages produced for the table.
 * @property processReport Report covering the parsing run as a whole.
 */
data class ParserResult(
    val messages: List<ResultMessage>,
    val processReport: ProcessReport
) {
    /**
     * Creates a result holding exactly one message built from [key], [jsonObject] and [report].
     */
    constructor(
        key: String,
        jsonObject: JsonObject,
        report: Report,
        processReport: ProcessReport
    ) : this(
        messages = listOf(ResultMessage(key, jsonObject, report)),
        processReport = processReport
    )
}
src/main/kotlin/models/ProcessReport.kt
0 → 100644
View file @
27726a51
/*
* Table Data Import Service
* Copyright (C) 2020 Memoriav
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package
org.memobase.models
import
com.beust.klaxon.Klaxon
/**
 * Report describing one processing run.
 *
 * @property id Identifier of the reporting step; defaults to `"table-data-transform"`.
 * @property status Status of the run.
 * @property total Number of processed items.
 * @property successes Number of items counted as successful.
 * @property failures Number of items counted as failed.
 */
data class ProcessReport(
    val id: String = "table-data-transform",
    val status: String,
    val total: Int,
    val successes: Int,
    val failures: Int
) {
    /**
     * Creates a report in which all [total] items share the given [status]: they are counted
     * as successes if the status equals [ReportStatus.success] and as failures if it equals
     * [ReportStatus.failure]; the respective other counter is zero.
     */
    constructor(status: String, total: Int) : this(
        id = "table-data-transform",
        status = status,
        total = total,
        successes = if (ReportStatus.success == status) total else 0,
        failures = if (ReportStatus.failure == status) total else 0
    )

    /** Serialises this report to a JSON string with Klaxon. */
    fun toJson(): String = Klaxon().toJsonString(this)
}
src/main/kotlin/Report.kt
→
src/main/kotlin/
models/
Report.kt
View file @
27726a51
/*
*
sftp-reader
*
Table Data Import Service
* Copyright (C) 2020 Memoriav
*
* This program is free software: you can redistribute it and/or modify
...
...
@@ -16,25 +16,16 @@
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package
org.memobase
package
org.memobase.models
import
com.beust.klaxon.Klaxon
data class
Report
(
val
id
:
String
,
val
status
:
String
,
val
message
:
String
val
id
:
String
,
val
status
:
String
,