Commit be1e51cd authored by Thomas Bernhart
Browse files

Resolve public files through sitemap as well

parent 879e8de7
import logging
from glob import glob
from os import listdir, makedirs, path, walk
from os import makedirs, path, walk
from shutil import copy2
from xml.dom.minidom import parse
from ch.memobase.media import MediaFileSearcher
from ch.memobase.records_sets import RecordSetIdMapper
from ch.memobase.foxml import FoxmlReader
......@@ -68,6 +69,16 @@ def _create_logger(name, logfile):
return logger
def _parse_sitemap(sitemap_file: str) -> dict:
    """Map original document IDs to the locators listed in an XML sitemap.

    For every ``<url>`` element in *sitemap_file*, the text of its first
    ``<originalId>`` descendant becomes a key and the text of its first
    ``<loc>`` descendant becomes the value. If an ID occurs more than once,
    the last entry wins.

    Entries that lack either element, or whose element is empty, are skipped
    instead of aborting the whole parse.

    :param sitemap_file: sitemap location handed to ``xml.dom.minidom.parse``
    :return: dict of original ID -> locator
    """
    def first_text(parent, tag_name):
        # Leading text of the first matching descendant; None if absent/empty.
        matches = parent.getElementsByTagName(tag_name)
        if not matches or matches[0].firstChild is None:
            return None
        return matches[0].firstChild.data

    sitemap = {}
    for url in parse(sitemap_file).getElementsByTagName('url'):
        # Deliberately not called `id`, which would shadow the builtin.
        original_id = first_text(url, 'originalId')
        locator = first_text(url, 'loc')
        if original_id is None or locator is None:
            continue
        sitemap[original_id] = locator
    return sitemap
def foxml_export(objectstore_path, output_path, recordsets_csv_file):
if not path.exists(output_path):
makedirs(output_path)
......@@ -86,7 +97,7 @@ def foxml_export(objectstore_path, output_path, recordsets_csv_file):
except FoxmlParsingError as parsing_error:
logger.error("Error while parsing FOXML file", exc_info=parsing_error)
else:
old_record_set_id = foxml_reader.get_recordset_id()
old_record_set_id = foxml_reader.get_recordset_identifier()
if old_record_set_id is not None:
logger.debug("FOXML file " + foxml_path + " belongs to record set " + old_record_set_id)
......@@ -98,6 +109,8 @@ def foxml_export(objectstore_path, output_path, recordsets_csv_file):
foxml_destination_path = path.join(record_set_export_path, path.basename(foxml_path) + ".xml")
copy2(foxml_path, foxml_destination_path, follow_symlinks=False)
logger.info("Exported FOXML file '" + foxml_path + "' to '" + foxml_destination_path + "'")
# TODO: Add locator from sitemap if specified
else:
logger.warning("Ignored FOXML file '" + foxml_path +
"': Old recordset ID not listed in 'record_sets_ids.csv'")
......@@ -108,16 +121,17 @@ def foxml_export(objectstore_path, output_path, recordsets_csv_file):
print("Finished FOXML export")
def media_export(record_set_path, datastreamstore_path, http_files_path, rtmp_files_path):
def media_export(record_set_path, datastreamstore_path, http_files_path, rtmp_files_path, sitemap_file):
logger = _create_logger("media_export", path.join(record_set_path, "media_export.log"))
for foxml_path in glob(path.join(record_set_path, "*.xml")):
logger.debug("Exporting media files for file: '" + foxml_path + "'")
foxml_reader = FoxmlReader(foxml_path)
document_id = foxml_reader.get_document_id()
document_id = foxml_reader.get_main_identifier()
media_file_searcher = MediaFileSearcher(foxml_reader, datastreamstore_path, http_files_path, rtmp_files_path)
media_file_searcher = MediaFileSearcher(foxml_reader, datastreamstore_path, http_files_path, rtmp_files_path,
_parse_sitemap(sitemap_file))
accesscopy_file = media_file_searcher.search_media_file()
if accesscopy_file is not None:
......
......@@ -80,16 +80,21 @@ class FoxmlReader:
else:
raise FoxmlParsingError("FOXML file '" + file + "' has no datastream with ID 'TRANSFORMED_METADATA_0'")
def get_recordset_id(self):
def get_recordset_identifier(self):
    """Look up the record set relation in the metadata datastream.

    Passes ``self.metadata_datastream_element`` and the ``isMemberOf``
    relation path to ``_get_element_text`` and returns its result unchanged.
    NOTE(review): callers compare the result against None, so a missing
    element presumably yields None - confirm in ``_get_element_text``.
    """
    relation_path = "foxml:xmlContent/ebucore:ebuCoreMain/ebucore:coreMetadata/ebucore:isMemberOf/ns2:relation"
    datastream = self.metadata_datastream_element
    return _get_element_text(datastream, relation_path)
def get_document_id(self):
def get_main_identifier(self):
    """Look up the identifier labelled 'Main' in the metadata datastream.

    Passes ``self.metadata_datastream_element`` and the path of the
    ``ebucore:identifier`` element whose ``typeLabel`` is ``Main`` to
    ``_get_element_text`` and returns its result unchanged.
    """
    main_id_path = "foxml:xmlContent/ebucore:ebuCoreMain/ebucore:coreMetadata/ebucore:identifier[@typeLabel='Main']/ns2:identifier"
    datastream = self.metadata_datastream_element
    return _get_element_text(datastream, main_id_path)
def get_original_identifier(self):
    """Look up the identifier labelled 'Original' in the metadata datastream.

    Passes ``self.metadata_datastream_element`` and the path of the
    ``ebucore:identifier`` element whose ``typeLabel`` is ``Original`` to
    ``_get_element_text`` and returns its result unchanged.
    """
    original_id_path = "foxml:xmlContent/ebucore:ebuCoreMain/ebucore:coreMetadata/ebucore:identifier[@typeLabel='Original']/ns2:identifier"
    datastream = self.metadata_datastream_element
    return _get_element_text(datastream, original_id_path)
def get_locator(self):
return _get_element_text(
self.metadata_datastream_element,
......
......@@ -4,11 +4,12 @@ from ch.memobase.foxml import FoxmlReader
class MediaFileSearcher:
def __init__(self, foxml_reader: FoxmlReader, datastream_store_path, http_files_path, rtmp_files_path):
def __init__(self, foxml_reader: FoxmlReader, datastream_store_path, http_files_path, rtmp_files_path, sitemap):
    """Remember the reader, the storage locations and the sitemap lookup.

    :param foxml_reader: reader for the FOXML document being searched
    :param datastream_store_path: stored as ``self.datastream_store_path``
    :param http_files_path: directory joined with the tail of HTTP file URLs
    :param rtmp_files_path: directory joined with RTMP source file names
    :param sitemap: mapping queried with ``.get(original_id)`` for a URL
    """
    # Lookup table used as the last resort when resolving a media file.
    self.sitemap = sitemap

    # Locations on disk in which candidate files are looked up.
    self.rtmp_files_path = rtmp_files_path
    self.http_files_path = http_files_path
    self.datastream_store_path = datastream_store_path

    # Source of identifiers, locators and datastream references.
    self.foxml_reader = foxml_reader
def search_media_file(self):
accesscopy_file = self.foxml_reader.get_accesscopy_datastream_file()
......@@ -18,7 +19,7 @@ class MediaFileSearcher:
elif locator is not None:
return self.__get_file_from_locator(locator)
else:
return None
return self.__get_file_from_sitemap(self.foxml_reader.get_original_identifier())
def search_thumbnail_file(self):
thumbnail_file = self.foxml_reader.get_thumbnail_datastream_file()
......@@ -40,15 +41,15 @@ class MediaFileSearcher:
else:
return None
def __get_http_resource_file(self, locator):
http_resource_path = path.join(self.http_files_path, locator[len('https://memobase.ch/files/'):])
def __get_http_resource_file(self, url):
    """Resolve a ``https://memobase.ch/files/`` URL to a local file.

    The part of *url* after the fixed prefix is joined onto
    ``self.http_files_path``.

    :param url: URL of the resource
    :return: ``(file_path, file_name)`` if that path is an existing file;
        None if it is not, or if *url* does not start with the prefix
    """
    url_prefix = 'https://memobase.ch/files/'
    if not url.startswith(url_prefix):
        # Sitemap locators arrive here unchecked. Slicing a foreign URL by
        # the prefix length would cut it at an arbitrary position and could
        # accidentally match an unrelated file.
        return None
    http_resource_path = path.join(self.http_files_path, url[len(url_prefix):])
    if path.isfile(http_resource_path):
        return http_resource_path, path.basename(http_resource_path)
    return None
def __get_rtmp_resource_file(self, locator):
rtmp_rel_path = locator[len('rtmp://intstream.memobase.ch:1935/memobase/'):]
def __get_rtmp_resource_file(self, url):
rtmp_rel_path = url[len('rtmp://intstream.memobase.ch:1935/memobase/'):]
source_filename = rtmp_rel_path[rtmp_rel_path.find(':') + 1:]
source_path1 = path.join(self.rtmp_files_path, source_filename)
source_path2 = path.join(self.rtmp_files_path, 'open', source_filename)
......@@ -59,3 +60,10 @@ class MediaFileSearcher:
return source_path2, path.basename(source_path2)
else:
return None
def __get_file_from_sitemap(self, original_document_id):
    """Resolve a media file through the sitemap lookup table.

    Fetches the URL stored in ``self.sitemap`` for *original_document_id*
    and hands it to the HTTP resource lookup.

    :param original_document_id: key to look up in the sitemap
    :return: result of the HTTP resource lookup, or None if the sitemap has
        no URL for the given ID
    """
    sitemap_url = self.sitemap.get(original_document_id)
    return None if sitemap_url is None else self.__get_http_resource_file(sitemap_url)
......@@ -12,6 +12,7 @@ arg_handler.add_argument("--object-store-directory", required=True)
arg_handler.add_argument("--datastream-store-directory", required=True)
arg_handler.add_argument("--http-resources-directory", required=True)
arg_handler.add_argument("--rtmp-resources-directory", required=True)
arg_handler.add_argument("--sitemap-file", required=True)
arg_handler.add_argument("--output-directory", required=True)
......@@ -23,4 +24,4 @@ for record_set_directory in listdir(args.output_directory):
record_set_path = path.join(args.output_directory, record_set_directory)
if path.isdir(record_set_path):
media_export(record_set_path, args.datastream_store_directory,
args.http_resources_directory, args.rtmp_resources_directory)
args.http_resources_directory, args.rtmp_resources_directory, args.sitemap_file)
......@@ -9,8 +9,9 @@ arg_handler.add_argument("--record-set-directory", required=True)
arg_handler.add_argument("--datastream-store-directory", required=True)
arg_handler.add_argument("--http-resources-directory", required=True)
arg_handler.add_argument("--rtmp-resources-directory", required=True)
arg_handler.add_argument("--sitemap-file", required=True)
args = arg_handler.parse_args()
media_export(args.record_set_directory, args.datastream_store_directory,
args.http_resources_directory, args.rtmp_resources_directory)
args.http_resources_directory, args.rtmp_resources_directory, args.sitemap_file)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment