Commit 75cd49ee authored by Jonas Waeber's avatar Jonas Waeber
Browse files

initial base implementation

parent 3dc10a44
# --- Build stage: compile the service and unpack its distribution archive ---
FROM gradle:6.3-jdk8 AS build
# COPY (not ADD): only plain local files are transferred, so ADD's
# archive-extraction / URL semantics are neither needed nor wanted here.
COPY . /
WORKDIR /
# distTar writes /build/distributions/app.tar (the archive name is fixed in
# build.gradle). Unpack it with `tar -C` in the same layer instead of a
# separate `RUN cd ... && tar ...` step.
RUN gradle --no-daemon --no-scan --no-build-cache distTar \
    && tar -xf /build/distributions/app.tar -C /build/distributions

# --- Runtime stage: JRE-only image containing just the unpacked distribution ---
FROM openjdk:8-jre-alpine
# Reference the build stage by name rather than by index so that inserting
# another stage later cannot silently change which stage is copied from.
COPY --from=build /build/distributions/app /app
# Exec form: the launcher script is started directly (no `/bin/sh -c` wrapper),
# so the signal sent by `docker stop` reaches the process.
CMD ["/app/bin/fedora-metadata-extractor"]
......@@ -629,8 +629,8 @@ to attach them to the start of each source file to most effectively
state the exclusion of warranty; and each file should have at least
the "copyright" line and a pointer to where the full notice is found.
Fedora Metadata Extractor
Copyright (C) 2020 memoriav / Memobase 2020 / services / postprocessing
record-parser
Copyright (C) 2020 memoriav / Memobase 2020 / services
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published
......
# Fedora Metadata Extractor
### Fedora Metadata Extractor
Extracts, parses, and propagates metadata from Fedora.
\ No newline at end of file
// Gradle plugins applied to this build.
plugins {
    // Core plugins: runnable JVM application and distribution archives.
    id 'application'
    id 'distribution'

    // Kotlin/JVM compilation.
    id 'org.jetbrains.kotlin.jvm' version '1.3.71'

    // Supplies gitVersion(), which is used below as the project version.
    id 'com.palantir.git-version' version '0.11.0'

    // Kotlin lint / formatting checks.
    id 'org.jlleitschuh.gradle.ktlint' version '9.2.1'
}
// Project coordinates; the version is derived from git by the git-version plugin.
group = 'org.memobase'
version = gitVersion()

// Entry point used by the application plugin's start scripts.
mainClassName = 'org.memobase.App'

// The same entry point is recorded in the jar manifest.
jar {
    manifest.attributes('Main-Class': 'org.memobase.App')
}

// Compile for Java 8.
sourceCompatibility = JavaVersion.VERSION_1_8
targetCompatibility = JavaVersion.VERSION_1_8
// Artifact repositories used for dependency resolution.
// NOTE(review): JFrog has sunset JCenter and Bintray; confirm that both
// repositories below still resolve, or migrate to their replacements.
repositories {
    jcenter()
    // Hosts the org.memobase artifacts declared in the dependencies block.
    maven {
        url "https://dl.bintray.com/memoriav/memobase"
    }
}
// Shared dependency versions, referenced from the dependencies block.
ext.kafkaV = '2.3.1'
// NOTE(review): 2.11.2 lies in the Log4j range affected by CVE-2021-44228
// (Log4Shell); verify and consider upgrading to a patched release.
ext.log4jV = '2.11.2'
// Compile-time and test dependencies.
// All test dependencies use `testImplementation`; the deprecated `testCompile`
// configuration (removed in later Gradle versions) is no longer used.
dependencies {
    implementation 'org.memobase:memobase-service-utilities:1.4.0'
    implementation 'org.memobase:fedora-client:0.1.2'

    // Logging Framework
    implementation "org.apache.logging.log4j:log4j-api:${log4jV}"
    implementation "org.apache.logging.log4j:log4j-core:${log4jV}"

    // Kafka Imports
    implementation "org.apache.kafka:kafka-streams:${kafkaV}"

    // RDF Library
    implementation 'org.apache.jena:apache-jena:3.14.0'

    // JSON Library
    implementation 'com.fasterxml.jackson.core:jackson-databind:2.11.+'
    implementation 'com.fasterxml.jackson.core:jackson-core:2.11.+'
    implementation 'com.fasterxml.jackson.core:jackson-annotations:2.11.+'
    implementation "com.fasterxml.jackson.module:jackson-module-kotlin:2.11.+"

    // Kotlin runtime libraries
    implementation 'org.jetbrains.kotlin:kotlin-stdlib-jdk8'
    implementation "org.jetbrains.kotlin:kotlin-script-runtime:1.3.71"
    implementation "org.jetbrains.kotlin:kotlin-reflect:1.3.71"

    // Test libraries
    testImplementation 'org.junit.jupiter:junit-jupiter:5.4.2'
    testImplementation 'org.assertj:assertj-core:3.15.0'
    // https://mvnrepository.com/artifact/org.apache.kafka/kafka-streams-test-utils
    testImplementation "org.apache.kafka:kafka-streams-test-utils:${kafkaV}"
}
// Emit Java 8 bytecode for both production and test Kotlin sources.
[compileKotlin, compileTestKotlin].each { kotlinTask ->
    kotlinTask.kotlinOptions.jvmTarget = "1.8"
}
// Run tests on the JUnit Platform and log the outcome of every test.
test {
    useJUnitPlatform()
    testLogging.events "passed", "skipped", "failed"
}
// Source and resource locations. Only YAML and XML files are picked up
// as resources, for both the main and the test source set.
sourceSets {
    main {
        kotlin.srcDirs += 'src/main/kotlin'
        resources.srcDirs = [ "src/main/resources" ]
        resources.includes = [ "**/*.yml", "**/*.xml"]
    }
    test {
        kotlin.srcDirs += 'src/test/kotlin'
        resources.srcDirs = [ "src/test/resources" ]
        resources.includes = [ "**/*.yml", "**/*.xml"]
    }
}
// Give the distribution tarball a fixed, version-independent file name
// (the Dockerfile unpacks build/distributions/app.tar).
plugins.withType(DistributionPlugin) {
    distTar {
        archiveFileName = 'app.tar'
    }
}
\ No newline at end of file
# Helm chart metadata for the fedora-metadata-extractor deployment.
apiVersion: v2
name: fedora-metadata-extractor
description: A micro service deployment to extract metadata from fedora based on the event stream.
type: application
# Placeholders: the release script rewrites both version fields with the tag.
version: 0.0.0
appVersion: 0.0.0
maintainers:
  # `email` belongs to the maintainer entry, not to the top level of the chart.
  - name: Jonas Waeber
    email: jonas.waeber@unibas.ch
# Application configuration exposed to the container as environment variables.
apiVersion: v1
kind: ConfigMap
metadata:
  name: "{{ .Values.deploymentName }}-app-config"
  namespace: memobase
data:
  APPLICATION_ID: "{{ .Values.deploymentName }}-deployment"
  TOPIC_IN: "{{ .Values.inputTopic }}"
  TOPIC_OUT: "{{ .Values.outputTopic }}"
  TOPIC_PROCESS: "{{ .Values.deploymentName }}-reporting"
\ No newline at end of file
# Single-replica Deployment running the fedora-metadata-extractor container.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: "{{ .Values.deploymentName }}-deployment"
  namespace: memobase
  labels:
    app: "{{ .Values.deploymentName }}-deployment"
spec:
  selector:
    matchLabels:
      app: "{{ .Values.deploymentName }}-deployment"
  replicas: 1
  template:
    metadata:
      labels:
        app: "{{ .Values.deploymentName }}-deployment"
        tier: post-processing
    spec:
      containers:
        - name: "{{ .Values.deploymentName }}-container"
          image: "{{ .Values.registry }}/{{ .Values.image }}:{{ .Values.tag }}"
          imagePullPolicy: Always
          # Environment is assembled from the Kafka config map, this chart's
          # own app config map and the Fedora secret.
          envFrom:
            - configMapRef:
                name: "{{ .Values.kafkaConfigs }}"
            - configMapRef:
                name: "{{ .Values.deploymentName }}-app-config"
            - secretRef:
                name: "{{ .Values.fedoraConfigs }}"
      restartPolicy: Always
############################################
## Values in this section are the same for #
## all jobs                                #
############################################
# Image values: combined in the deployment template as <registry>/<image>:<tag>.
registry: "cr.gitlab.switch.ch"
image: "memoriav/memobase-2020/services/postprocessing/fedora-metadata-extractor"
# The release script replaces this exact line with the release tag.
tag: "latest"
# Base name for the deployment, container and app config map.
deploymentName: fedora-metadata-extractor
# Name of the config map referenced by the deployment (envFrom configMapRef).
kafkaConfigs: prod-kafka-bootstrap-servers
# Name of the secret referenced by the deployment (envFrom secretRef).
fedoraConfigs: prod-fedora-configs
# Exposed to the application as TOPIC_OUT.
outputTopic: fedora-metadata-extractor-json-output
# Exposed to the application as TOPIC_IN.
inputTopic: fedora-events
\ No newline at end of file
# Selects the "official" Kotlin code style for tooling that reads this property.
kotlin.code.style=official
\ No newline at end of file
#!/usr/bin/env sh
##############################################################################
##
## Gradle start up script for UN*X
##
##############################################################################
# Attempt to set APP_HOME
# Resolve links: $0 may be a link
PRG="$0"
# Need this for relative symlinks.
# Follow the chain of symlinks until PRG names a non-link file.
while [ -h "$PRG" ] ; do
ls=`ls -ld "$PRG"`
# Extract the link target (the text after "-> ") from the ls output.
link=`expr "$ls" : '.*-> \(.*\)$'`
# Absolute targets replace PRG; relative targets are resolved against PRG's directory.
if expr "$link" : '/.*' > /dev/null; then
PRG="$link"
else
PRG=`dirname "$PRG"`"/$link"
fi
done
# Remember the caller's directory, resolve the script directory physically, then return.
SAVED="`pwd`"
cd "`dirname \"$PRG\"`/" >/dev/null
APP_HOME="`pwd -P`"
cd "$SAVED" >/dev/null
APP_NAME="Gradle"
APP_BASE_NAME=`basename "$0"`
# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
DEFAULT_JVM_OPTS='"-Xmx64m"'
# Use the maximum available, or set MAX_FD != -1 to use that value.
MAX_FD="maximum"
# Print all arguments as a single message and continue.
warn () {
echo "$*"
}
# Print all arguments as a single message framed by blank lines,
# then terminate the script with exit status 1.
die () {
echo
echo "$*"
echo
exit 1
}
# OS specific support (must be 'true' or 'false').
cygwin=false
msys=false
darwin=false
nonstop=false
# Set the flag matching the prefix of `uname`; all flags stay false on other systems.
case "`uname`" in
CYGWIN* )
cygwin=true
;;
Darwin* )
darwin=true
;;
MINGW* )
msys=true
;;
NONSTOP* )
nonstop=true
;;
esac
# The wrapper jar is the only classpath entry needed to bootstrap Gradle.
CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
# Determine the Java command to use to start the JVM.
# Prefer JAVA_HOME when it is set; otherwise fall back to `java` on the PATH.
if [ -n "$JAVA_HOME" ] ; then
if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
# IBM's JDK on AIX uses strange locations for the executables
JAVACMD="$JAVA_HOME/jre/sh/java"
else
JAVACMD="$JAVA_HOME/bin/java"
fi
# Abort when the chosen JAVA_HOME executable is missing or not executable.
if [ ! -x "$JAVACMD" ] ; then
die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
Please set the JAVA_HOME variable in your environment to match the
location of your Java installation."
fi
else
JAVACMD="java"
which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
Please set the JAVA_HOME variable in your environment to match the
location of your Java installation."
fi
# Increase the maximum file descriptors if we can.
# Skipped on Cygwin, Darwin and NonStop.
if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then
# Query the hard limit; $? below reflects whether that query succeeded.
MAX_FD_LIMIT=`ulimit -H -n`
if [ $? -eq 0 ] ; then
# "maximum"/"max" means: use the hard limit itself.
if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
MAX_FD="$MAX_FD_LIMIT"
fi
ulimit -n $MAX_FD
# Failure to raise the limit only produces a warning.
if [ $? -ne 0 ] ; then
warn "Could not set maximum file descriptor limit: $MAX_FD"
fi
else
warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
fi
fi
# For Darwin, add options to specify how the application appears in the dock
# The options are appended to GRADLE_OPTS with embedded quotes, which are
# interpreted later when the final command line is built with `eval`.
if $darwin; then
GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
fi
# For Cygwin, switch paths to Windows format before running java
if $cygwin ; then
APP_HOME=`cygpath --path --mixed "$APP_HOME"`
CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
JAVACMD=`cygpath --unix "$JAVACMD"`
# We build the pattern for arguments to be converted via cygpath
# ROOTDIRS becomes a "|"-separated alternation of the directories directly under /.
ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
SEP=""
for dir in $ROOTDIRSRAW ; do
ROOTDIRS="$ROOTDIRS$SEP$dir"
SEP="|"
done
OURCYGPATTERN="(^($ROOTDIRS))"
# Add a user-defined pattern to the cygpath arguments
if [ "$GRADLE_CYGPATTERN" != "" ] ; then
OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
fi
# Now convert the arguments - kludge to limit ourselves to /bin/sh
# Each argument is stored in a numbered variable args0, args1, ...;
# arguments matching the pattern that are not options are converted with cygpath.
i=0
for arg in "$@" ; do
CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option
if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition
eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
else
eval `echo args$i`="\"$arg\""
fi
i=$((i+1))
done
# Rebuild the positional parameters from the numbered variables.
# Only 0-9 arguments are handled; with more, no case matches and the
# original positional parameters are left unchanged.
case $i in
(0) set -- ;;
(1) set -- "$args0" ;;
(2) set -- "$args0" "$args1" ;;
(3) set -- "$args0" "$args1" "$args2" ;;
(4) set -- "$args0" "$args1" "$args2" "$args3" ;;
(5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
(6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
(7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
(8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
(9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
esac
fi
# Escape application args
# Prints every argument wrapped in single quotes (embedded single quotes are
# escaped as '\'') followed by " \", then a final line holding a single space.
# The output is meant to be re-parsed by the `eval set --` below.
save () {
for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done
echo " "
}
APP_ARGS=$(save "$@")
# Collect all arguments for the java command, following the shell quoting and substitution rules
# After this line the positional parameters are the complete JVM command line:
# JVM options, the app-name system property, the classpath, the wrapper main
# class and finally the escaped application arguments.
eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS"
# by default we should be in the correct project dir, but when run from Finder on Mac, the cwd is wrong
if [ "$(uname)" = "Darwin" ] && [ "$HOME" = "$PWD" ]; then
cd "$(dirname "$0")"
fi
# Replace this shell with the JVM process.
exec "$JAVACMD" "$@"
@if "%DEBUG%" == "" @echo off
@rem ##########################################################################
@rem
@rem Gradle startup script for Windows
@rem
@rem ##########################################################################
@rem Set local scope for the variables with windows NT shell
if "%OS%"=="Windows_NT" setlocal
@rem DIRNAME is the drive and directory of this script; fall back to "." when empty.
set DIRNAME=%~dp0
if "%DIRNAME%" == "" set DIRNAME=.
@rem APP_BASE_NAME is this script's file name without extension.
set APP_BASE_NAME=%~n0
set APP_HOME=%DIRNAME%
@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
set DEFAULT_JVM_OPTS="-Xmx64m"
@rem Find java.exe
@rem When JAVA_HOME is defined it is used exclusively; otherwise java.exe is
@rem looked up on the PATH by running "java.exe -version".
if defined JAVA_HOME goto findJavaFromJavaHome
set JAVA_EXE=java.exe
%JAVA_EXE% -version >NUL 2>&1
if "%ERRORLEVEL%" == "0" goto init
echo.
echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
echo.
echo Please set the JAVA_HOME variable in your environment to match the
echo location of your Java installation.
goto fail
:findJavaFromJavaHome
@rem Strip any double quotes from JAVA_HOME before building the executable path.
set JAVA_HOME=%JAVA_HOME:"=%
set JAVA_EXE=%JAVA_HOME%/bin/java.exe
if exist "%JAVA_EXE%" goto init
echo.
echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
echo.
echo Please set the JAVA_HOME variable in your environment to match the
echo location of your Java installation.
goto fail
:init
@rem Get command-line arguments, handling Windows variants
@rem Both outcomes of this test continue at :win9xME_args, which follows directly.
if not "%OS%" == "Windows_NT" goto win9xME_args
:win9xME_args
@rem Slurp the command line arguments.
set CMD_LINE_ARGS=
set _SKIP=2
:win9xME_args_slurp
@rem With no arguments CMD_LINE_ARGS stays empty; otherwise it receives all of them.
if "x%~1" == "x" goto execute
set CMD_LINE_ARGS=%*
:execute
@rem Setup the command line
set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
@rem Execute Gradle
"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%
:end
@rem End local scope for the variables with windows NT shell
@rem A zero ERRORLEVEL skips the failure path below.
if "%ERRORLEVEL%"=="0" goto mainEnd
:fail
rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
rem the _cmd.exe /c_ return code!
if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
exit /b 1
:mainEnd
if "%OS%"=="Windows_NT" endlocal
:omega
#!/usr/bin/env bash
# Stamps the Helm chart with the release version.
#
# Arguments:   $1             - release name, used only for the log line.
# Environment: CI_COMMIT_TAG  - tag written into chart/Chart.yaml (version,
#                               appVersion) and chart/values.yaml (image tag).
#
# Exits non-zero if CI_COMMIT_TAG is empty or any substitution command fails.
set -euo pipefail

echo "Creating release: ${1:-}"

# An empty tag would silently produce "version: " and an unusable chart.
if [[ -z "${CI_COMMIT_TAG:-}" ]]; then
  echo "ERROR: CI_COMMIT_TAG is not set; refusing to write an empty version." >&2
  exit 1
fi

# Dots are escaped so that only the literal placeholder 0.0.0 is matched.
sed -i "s/version: 0\.0\.0/version: $CI_COMMIT_TAG/g" ./chart/Chart.yaml
# appVersion and the image tag are written quoted so that YAML keeps them as
# strings (an unquoted value such as 1.0 would be parsed as a number).
sed -i "s/appVersion: 0\.0\.0/appVersion: \"$CI_COMMIT_TAG\"/g" ./chart/Chart.yaml
sed -i "s/tag: \"latest\"/tag: \"$CI_COMMIT_TAG\"/g" ./chart/values.yaml
\ No newline at end of file
// Name of the root Gradle project.
rootProject.name = 'fedora-metadata-extractor'
/*
* fedora-metadata-extractor
* Copyright (C) 2020 Memoriav
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package org.memobase
import kotlin.system.exitProcess
import org.apache.logging.log4j.LogManager
/**
 * Application entry point.
 */
class App {
    companion object {
        private val log = LogManager.getLogger("FedoraMetadataExtractionApp")

        /**
         * Runs the [Service]. If it throws an [Exception], the failure is logged
         * together with its stack trace and the process exits with status 1.
         *
         * @param args command-line arguments (not used).
         */
        @JvmStatic fun main(args: Array<String>) {
            try {
                Service().run()
            } catch (ex: Exception) {
                // Hand the throwable to the logger so the stack trace ends up in
                // the configured log output instead of being printed separately
                // to stderr via printStackTrace().
                log.error("Stopping application due to error: " + ex.message, ex)
                exitProcess(1)
            }
        }
    }
}
/*
* fedora-metadata-extractor
* Copyright (C) 2020 Memoriav
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package org.memobase
import com.fasterxml.jackson.databind.ObjectMapper
import com.fasterxml.jackson.module.kotlin.registerKotlinModule
import org.apache.jena.rdf.model.Model
import org.apache.jena.rdf.model.ModelFactory
import org.apache.jena.rdf.model.Property
import org.apache.jena.rdf.model.Resource
import org.apache.jena.riot.Lang
import org.apache.jena.riot.RDFDataMgr
import org.apache.kafka.streams.StreamsBuilder
import org.apache.kafka.streams.Topology
import org.apache.kafka.streams.kstream.Predicate
import org.apache.logging.log4j.LogManager
import org.memobase.fedora.FedoraClientImpl
import org.memobase.model.EventMessage
import org.memobase.rdf.NS
import org.memobase.rdf.RDF
import org.memobase.rdf.RICO
import org.memobase.settings.SettingsLoader
import java.io.StringWriter
import java.net.URI
import java.nio.charset.StandardCharsets
class KafkaTopology(
private val settings: SettingsLoader
) {
private val log = LogManager.getLogger("FedoraMetadataExtractor")

// Jackson mapper with Kotlin support, used to deserialize incoming event messages.
private val objectMapper = ObjectMapper().registerKotlinModule()

// Fedora client configured from the application settings (URLs and credentials).
private val fedora = settings.appSettings.let { props ->
    FedoraClientImpl.builder()
        .urls(
            props.getProperty("fedora.internalDomain"),
            props.getProperty("fedora.externalDomain")
        )
        .credentials(
            props.getProperty("fedora.user"),
            props.getProperty("fedora.password")
        )
        .build()
}
/**
 * Assembles the Kafka Streams topology.
 *
 * Messages from the input topic are parsed into event messages and split into
 * two branches by object type. Both branches fetch and parse the primary
 * resource, add the status and write the model to the output topic; the first
 * branch (records) additionally requests further record resources.
 *
 * @return the built [Topology].
 */
fun build(): Topology {
    val streamsBuilder = StreamsBuilder()
    val parsedEvents = streamsBuilder
        .stream<String, String>(settings.inputTopic)
        .mapValues { raw -> parseMessage(raw) }

    // TODO: Add actual values.
    val objectBranches = parsedEvents.branch(
        Predicate { _, event -> event.objectTypes == "rico:Record" },
        Predicate { _, event ->
            event.objectTypes == "rico:CorporateBody" || event.objectTypes == "rico:RecordSet"
        }
    )

    // Branch 0: records.
    val recordEvents = objectBranches[0]
    recordEvents
        .mapValues { event -> requestPrimaryResource(event) }
        .mapValues { fetched -> parseModel(fetched) }
        .mapValues { parsed -> addStatus(parsed) }
        .mapValues { withStatus -> requestAdditionalRecordResources(withStatus) }
        .mapValues { complete -> writeModel(complete) }
        .to(settings.outputTopic)

    // Branch 1: corporate bodies and record sets.
    val otherEvents = objectBranches[1]
    otherEvents
        .mapValues { event -> requestPrimaryResource(event) }
        .mapValues { fetched -> parseModel(fetched) }
        .mapValues { parsed -> addStatus(parsed) }
        .mapValues { complete -> writeModel(complete) }
        .to(settings.outputTopic)

    return streamsBuilder.build()
}
/**
 * Deserializes a raw message value into an [EventMessage].
 *
 * Parse errors are not handled here and propagate to the caller.
 * Open question from the original author: can the input always be assumed
 * to be well-formed, or should parse errors be handled?
 */
private fun parseMessage(data: String): EventMessage =
    objectMapper.readValue(data, EventMessage::class.java)
/**
 * Fetches the resource at the message's object path from Fedora and returns
 * it, as a string, paired with the originating message.
 */
private fun requestPrimaryResource(message: EventMessage): Pair<EventMessage, String> {
    val resourceUri = URI.create(message.objectPath)
    // NOTE(review): "plain/text" is not the usual media type spelling
    // ("text/plain"); confirm what the Fedora client expects here.
    val body = fedora.fetchRdfResourceIntoString(resourceUri, "plain/text")
    return message to body
}
/**
 * Parses the fetched resource text (read as UTF-8 N-Triples) into a fresh
 * Jena model and pairs it with the originating message.
 */
private fun parseModel(input: Pair<EventMessage, String>): Pair<EventMessage, Model> {
    val (message, rdfText) = input
    // TODO: Add exception handling
    val parsed = ModelFactory.createDefaultModel().also { target ->
        RDFDataMgr.read(target, rdfText.byteInputStream(StandardCharsets.UTF_8), Lang.NT)
    }
    return Pair(message, parsed)
}
/**
* Adds the status of the event message to the core resources.
* Core resources are rico:Record, rico:CorporateBody (for institutions) and rico:RecordSet.