Verified Commit 538f718c authored by Sebastian Schüpbach

fix error message

parent 54b88057
Pipeline #35609 passed with stages in 1 minute and 50 seconds
@@ -20,11 +20,11 @@ import os
from mediametadatatodb_app.resources.MediametadataToDB import MediametadataToDB

if __name__ == "__main__":
    numeric_level = getattr(logging, os.getenv("LOG_LEVEL").upper(), None)
    if not isinstance(numeric_level, int):
        raise ValueError(f'Invalid log level: {os.getenv("LOG_LEVEL")}')
    logging.basicConfig(
        format="%(levelname)-8s [%(filename)s:%(lineno)d] %(message)s",
        level=numeric_level,
    )
    logging.info("Starting up")
@@ -38,26 +38,26 @@ def _connect_to_kafka(retries=0):
    """
    try:
        consumer = KafkaConsumer(
            os.environ["TOPIC_IN"],
            value_deserializer=lambda m: json.loads(m.decode("utf8")),
            bootstrap_servers=os.environ["KAFKA_BOOTSTRAP_SERVERS"],
            auto_offset_reset="earliest",
            enable_auto_commit=False,
            group_id=os.environ["GROUP_ID"],
            consumer_timeout_ms=30000,
        )
        return consumer
    except KafkaError as ex:
        status = "KafkaError: " + str(ex)
        logging.error(status)
        if retries < int(os.environ["KAFKA_CONNECTION_RETRIES"]):
            time.sleep(30 * (retries + 1))
            _connect_to_kafka(retries + 1)
        exit(1)
    except Exception as ex:
        status = "Exception: " + str(ex)
        logging.error(status)
        if retries < int(os.environ["KAFKA_CONNECTION_RETRIES"]):
            time.sleep(30 * (retries + 1))
            _connect_to_kafka(retries + 1)
        exit(1)
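The retry handling above backs off linearly: every failed attempt waits 30 seconds longer than the previous one before reconnecting. A minimal sketch of that schedule, assuming KAFKA_CONNECTION_RETRIES is set as in the code above (the fallback value 3 is only illustrative):

import os

def backoff_schedule(max_retries: int) -> list:
    # Mirrors the sleep used above: 30 * (retries + 1) seconds per attempt
    return [30 * (attempt + 1) for attempt in range(max_retries)]

# With KAFKA_CONNECTION_RETRIES=3 this yields [30, 60, 90]
print(backoff_schedule(int(os.getenv("KAFKA_CONNECTION_RETRIES", "3"))))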
@@ -69,16 +69,16 @@ def _extract_fields(
    """
    Extract fields from JSON object by applying `fetch_from_obj_fun` function
    """
    if "locator" in record_json_data and "@id" in record_json_data:
        return fetch_from_obj_fun(record_json_data, access_status), None
    elif "@id" in record_json_data:
        logging.info(
            "Record " + record_json_data["@id"] + " does not have a locator property."
        )
        return dict(), "No locator property found"
    else:
        logging.warning("Record without @id-property detected!")
        return dict(), "No @id property found"


def _extract_thumbnail_values(msg, _access_status) -> dict:
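_extract_fields always returns a (values, error) pair, so callers can tell an extracted resource apart from one that has to be skipped. A sketch of the three possible outcomes (the dict content is invented; the exact parameter order of _extract_fields is not visible in this hunk):

# (values, error) pairs as produced by _extract_fields
ok = ({"type": "image", "access": "public"}, None)   # 'locator' and '@id' present
no_locator = ({}, "No locator property found")       # '@id' present, 'locator' missing
no_id = ({}, "No @id property found")                # '@id' missing altogether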
@@ -87,33 +87,33 @@ def _extract_thumbnail_values(msg, _access_status) -> dict:
    """
    del _access_status
    return_values = {
        "type": "image",
        "access": "public",
        "proto": "file",
        "sig": "{}-poster".format(msg["@id"].split("/")[-2]),
    }
    if "height" in msg:
        height = _normalize_dimension(msg["height"])
        return_values["height"] = height
    if "width" in msg:
        width = _normalize_dimension(msg["width"])
        return_values["width"] = width
    if "hasMimeType" in msg:
        return_values["mimetype"] = msg["hasMimeType"]
        if return_values["mimetype"] == "image/jpeg":
            file_extension = "jpg"
        elif return_values["mimetype"] == "image/png":
            file_extension = "png"
        elif return_values["mimetype"] == "image/jp2":
            file_extension = "jp2"
        else:
            file_extension = ""
            logging.warning("No valid mimetype found!")
    else:
        file_extension = ""
        logging.warning("No valid mimetype found!")
    return_values["uri"] = "file:///data/{}-poster.{}".format(
        msg["@id"].split("/")[-2], file_extension
    )
    return return_values
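The thumbnail signature and URI are both derived from the second-to-last path segment of the record's @id. A hedged worked example (the @id and values below are invented; _normalize_dimension is not part of this diff, so the height/width conversion is only assumed):

msg = {
    "@id": "https://memobase.ch/digital/rec-0001/thumbnail",  # invented id
    "hasMimeType": "image/jpeg",
    "height": "240",
    "width": "320",
}
# msg["@id"].split("/")[-2] == "rec-0001", so roughly:
# {"type": "image", "access": "public", "proto": "file", "sig": "rec-0001-poster",
#  "mimetype": "image/jpeg", "uri": "file:///data/rec-0001-poster.jpg",
#  "height"/"width" as returned by _normalize_dimension}
values = _extract_thumbnail_values(msg, None)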
@@ -122,57 +122,57 @@ def _extract_dig_obj_vals(msg, access_status) -> dict:
    """
    Extract information on digital object from JSON object
    """
    if "isDistributedOn" not in msg:
        logging.warning("No isDistributedOn property found in object")
        return dict()
    file_extension = ""
    return_values = {"access": access_status, "sig": msg["@id"].split("/")[-1]}
    if "height" in msg:
        height = _normalize_dimension(msg["height"])
        return_values["height"] = height
    if "width" in msg:
        width = _normalize_dimension(msg["width"])
        return_values["width"] = width
    if "duration" in msg:
        duration = _normalize_duration(msg["duration"])
        return_values["duration"] = duration
    return_values["type"] = msg["isDistributedOn"]
    if "hasMimeType" in msg:
        mimetype = msg["hasMimeType"]
        return_values["mimetype"] = mimetype
    if _is_remote_file(msg):
        return_values["uri"] = msg["locator"]
        if access_status == "public" and not return_values["type"] == "image":
            # Remote images are always accessed via proxy because their respective
            # route goes over the internal image server
            return_values["proto"] = "redirect"
        else:
            if msg.get("proxyType"):
                return_values["proto"] = msg["proxyType"]
            else:
                return_values["proto"] = "proxydirect"
    else:
        return_values["proto"] = "file"
        if return_values["type"] == "image":
            if return_values.get("mimetype"):
                if return_values["mimetype"] == "image/jpeg":
                    file_extension = "jpg"
                elif return_values["mimetype"] == "image/png":
                    file_extension = "png"
                elif return_values["mimetype"] == "image/jp2":
                    file_extension = "jp2"
                else:
                    file_extension = ""
                    logging.warning("No valid mimetype found!")
            else:
                file_extension = ""
                logging.warning("No valid mimetype found!")
        if return_values["type"] == "audio":
            file_extension = "mp4"
        if return_values["type"] == "video":
            file_extension = "mp4"
        return_values["uri"] = (
            os.environ["URI_BASE"] + return_values["sig"] + "." + file_extension
        )
    return return_values
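Whether the uri points at the remote locator or at the internal media server depends on _is_remote_file and the access status: public, non-image remote objects get proto "redirect", other remote objects go through a proxy, and locally mounted files get proto "file" with a URI built from URI_BASE. A hedged example for a locally stored audio object (the @id, locator and URI_BASE value are invented):

import os

os.environ.setdefault("URI_BASE", "https://media.example.org/")  # placeholder base URI
msg = {
    "@id": "https://memobase.ch/digital/rec-0001",   # invented id
    "isDistributedOn": "audio",
    "locator": "sftp://ingest-host/rec-0001.wav",    # sftp:/ prefix -> local mount
    "duration": "2:05",
}
# Expected roughly: {"access": "public", "sig": "rec-0001", "duration": 125,
#                    "type": "audio", "proto": "file",
#                    "uri": "https://media.example.org/rec-0001.mp4"}
values = _extract_dig_obj_vals(msg, "public")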
@@ -182,21 +182,21 @@ def _create_audio_snippet_entry(record, access_status) -> dict:
    Create an audio snippet entry based on the digital object
    """
    snippet_record = record.copy()
    if "duration" not in snippet_record:
        logging.warning("No duration for audio found: Setting duration to 0")
        snippet_record["duration"] = 0
    snippet_record["sig"] = snippet_record["sig"] + "-intro"
    snippet_record["access"] = access_status
    # //@formatter:off
    snippet_record["duration"] = (
        30
        if _normalize_duration(snippet_record["duration"]) >= 30
        else _normalize_duration(snippet_record["duration"])
    )
    # //@formatter:on
    snippet_record["mimetype"] = "audio/mpeg"
    snippet_record["uri"] = (
        ".".join(snippet_record["uri"].split(".")[0:-1]) + "-intro.mp3"
    )
    return snippet_record
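The snippet entry reuses the digital-object record but caps the duration at 30 seconds and rewrites signature and URI with an -intro suffix. A small example with an invented input record, assuming the helpers above are in scope:

record = {
    "sig": "rec-0001",
    "uri": "https://media.example.org/rec-0001.mp4",  # invented URI
    "type": "audio",
    "duration": 95,
}
snippet = _create_audio_snippet_entry(record, "public")
# snippet["sig"]      == "rec-0001-intro"
# snippet["duration"] == 30   (capped from 95 seconds)
# snippet["mimetype"] == "audio/mpeg"
# snippet["uri"]      == "https://media.example.org/rec-0001-intro.mp3"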
@@ -205,7 +205,7 @@ def _is_remote_file(msg) -> bool:
    """
    Media file is saved on a remote system
    """
    return "locator" in msg and not msg["locator"].startswith("sftp:/")


def _get_access_status(graph, record_id) -> str:
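Only locators with an sftp:/ prefix are treated as locally mounted files; anything else counts as remote. Two quick checks (the locator values are invented):

_is_remote_file({"locator": "sftp://ingest-host/data/rec-0001.mp4"})  # False: local mount
_is_remote_file({"locator": "https://partner.example.org/rec-0001"})  # True: remote file
_is_remote_file({})                                                   # False: no locator at all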
@@ -218,55 +218,55 @@ def _get_access_status(graph, record_id) -> str:
    access_flags = list()
    for resource in graph:
        if (
            "type" in resource
            and resource["type"] == "access"
            and "regulates" in resource
            and resource["regulates"].startswith("https://memobase.ch/digital/")
            and "name" in resource
        ):
            if resource["name"] == "public":
                access_flags.append("public")
            elif resource["name"] == "private":
                logging.debug(
                    f"{record_id}: Setting access for digital object to `closed`"
                )
                access_flags.append("closed")
            else:
                logging.info(
                    f"Digital object of record {record_id} has access type "
                    + f'`{resource["name"]}`. This makes the media resource unavailable.'
                )
                access_flags.append(resource["name"])
    if "closed" in access_flags:
        logging.debug(f"{record_id}: Setting access for digital object to `closed`")
        return "closed"
    elif "public" in access_flags:
        logging.debug(f"{record_id}: Setting access for digital object to `public`")
        return "public"
    elif "faro" in access_flags:
        logging.info(
            f"Digital object of record {record_id} has access type `faro`."
            + " This makes the media resource unavailable."
        )
        return "faro"
    elif "onsite" in access_flags:
        logging.info(
            f"Digital object of record {record_id} has access type `onsite`."
            + " This makes the media resource unavailable."
        )
        return "onsite"
    elif "noonsite" in access_flags:
        logging.info(
            f"Digital object of record {record_id} has access type `noonsite`."
            + " This makes the media resource unavailable."
        )
        return "noonsite"
    else:
        logging.warning(
            f"Digital object of record {record_id} has no or invalid access information!"
            + " The media resource is therefore unavailable"
        )
        return "unavailable"


def _get_record_id(graph) -> Optional[str]:
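When a graph carries several access resources, the flags are collected first and then resolved by precedence: closed (recorded for private) wins over public, which wins over faro, onsite and noonsite; anything else ends up as unavailable. A short example with an invented graph, assuming the function above is in scope:

graph = [
    {"type": "access", "regulates": "https://memobase.ch/digital/rec-0001", "name": "public"},
    {"type": "access", "regulates": "https://memobase.ch/digital/rec-0001", "name": "private"},
]
# "private" is recorded as "closed", and "closed" takes precedence over "public"
assert _get_access_status(graph, "rec-0001") == "closed"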
@@ -275,10 +275,10 @@ def _get_record_id(graph) -> Optional[str]:
    """
    for resource in graph:
        if (
            "@type" in resource
            and resource["@type"] == "https://www.ica.org/standards/RiC/ontology#Record"
        ):
            return resource["@id"] if "@id" in resource else None


def _has_audio_snippet(record) -> bool:
@@ -286,9 +286,9 @@ def _has_audio_snippet(record) -> bool:
    Record has an attached audio snippet (created by external service)
    """
    return (
        record["type"] == "audio"
        and "uri" in record
        and record["uri"].startswith("file://")
    )
@@ -303,21 +303,21 @@ def _normalize_duration(duration) -> int:
    """
    Normalise different representation of duration
    """
    if re.fullmatch(r"\d+:\d{2}", str(duration), re.ASCII):
        split = duration.split(":")
        return int(split[0]) * 60 + int(split[1])
    elif re.fullmatch(r"\d+:\d{2}:\d{2}", str(duration), re.ASCII):
        split = duration.split(":")
        return int(split[0]) * 3600 + int(split[1]) * 60 + int(split[2])
    elif re.fullmatch(r"\d+:\d{2}:\d{2}\d{3}", str(duration), re.ASCII):
        split = duration.split(":")
        return int(split[0]) * 3600 + int(split[1]) * 60 + int(split[2])
    elif re.fullmatch(r"\d+.\d{6}", str(duration), re.ASCII):
        return int(duration.split(".")[0])
    elif re.fullmatch(r"\d+", str(duration), re.ASCII):
        return int(duration)
    else:
        logging.warning(f"Can't parse duration `{duration}`")
        return 0
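_normalize_duration accepts m:ss, h:mm:ss, a seconds value with six decimal places, or a plain integer, and always returns whole seconds; unparseable input falls back to 0 with a warning. A few worked cases, assuming the function above is in scope:

_normalize_duration("2:05")       # 2 * 60 + 5            -> 125
_normalize_duration("1:02:03")    # 1 * 3600 + 2 * 60 + 3 -> 3723
_normalize_duration("12.345678")  # seconds.microseconds  -> 12
_normalize_duration(42)           # plain integer         -> 42
_normalize_duration("n/a")        # no pattern matches    -> 0 (with a warning)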
@@ -325,21 +325,21 @@ def _is_digital_object(resource) -> bool:
    """
    Resource is of type `digital object`
    """
    return "type" in resource and resource["type"] == "digitalObject"


def _is_thumbnail(resource) -> bool:
    """
    Resource is of type `thumbnail`
    """
    return "type" in resource and resource["type"] == "thumbnail"


def _is_playable(access_status) -> bool:
    """
    Digital object can be retrieved via link
    """
    return access_status == "public" or access_status == "closed"


class MediametadataToDB:
@@ -357,10 +357,10 @@ class MediametadataToDB:
        consumer.poll()
        for record_object in consumer:
            counter += 1
            record = record_object.value["@graph"]
            headers = record_object.headers
            record_id = _get_record_id(record)
            logging.debug(f"Processing record {record_id}")
            record_processor.new_record(record_id, headers)
            access_status = _get_access_status(record, record_id)
            for record_resource in record:
@@ -391,9 +391,9 @@ class MediametadataToDB:
                    record_processor.digital_object_fail(
                        record_id, error
                    )
                elif access_status == "unavailable":
                    record_processor.digital_object_fail(
-                        record_id, "invalid" " or missing" "access flag"
+                        record_id, "invalid or missing access flag"