Commit c2d800d9 authored by Thomas Bernhart
Browse files

Remove obsolete old export script

parent c019fb93
#!/usr/bin/env python
# coding: utf-8
import os
import sys
import traceback
import xml.etree.ElementTree as ET
from hashlib import md5
from shutil import copy2
from urllib.parse import quote
class FoxmlParsingError(Exception):
    """Exception type for FOXML parsing failures.

    NOTE(review): nothing in the visible code raises or catches this class.
    """
def get_last_element(tree, xpath_expression, namespaces, sort_by_attrib):
    """Return the matching element with the greatest value of an attribute.

    Args:
        tree: Object offering ``findall(path, namespaces)`` (an
            ``ElementTree`` or an ``Element``).
        xpath_expression: ElementPath expression selecting the candidates.
        namespaces: Prefix-to-URI mapping used to resolve the expression.
        sort_by_attrib: Name of the attribute that ranks the candidates.
            Attribute values are compared as strings.

    Returns:
        The candidate whose attribute value is largest (the first one found
        when several share that value), or ``None`` when nothing matches.
    """
    candidates = tree.findall(xpath_expression, namespaces)
    # A candidate without the attribute ranks lowest instead of raising
    # KeyError and aborting the whole lookup.
    return max(
        candidates,
        key=lambda elem: elem.attrib.get(sort_by_attrib, ""),
        default=None,
    )
# Common prefix of every metadata lookup inside a datastream version.
_CORE_METADATA_PATH = "foxml:xmlContent/ebucore:ebuCoreMain/ebucore:coreMetadata/"


def _latest_datastream_file(tree, namespaces, datastream_id):
    """Return ``(label, ref)`` for the newest version of a datastream, or ``None``.

    ``None`` is returned when the datastream has no version, when the newest
    version has no ``LABEL`` attribute, or when it has no
    ``foxml:contentLocation`` child carrying a ``REF`` attribute.
    """
    version = get_last_element(
        tree,
        ".//foxml:datastream[@ID='{}']/foxml:datastreamVersion".format(datastream_id),
        namespaces,
        'CREATED',
    )
    if version is None:
        return None
    label = version.attrib.get('LABEL')
    location = version.find("foxml:contentLocation", namespaces)
    if label is None or location is None:
        return None
    ref = location.attrib.get('REF')
    if ref is None:
        return None
    return label, ref


def parse_into_fedora_object(tree, namespaces, metadata_datastream_version):
    """Collect the export-relevant fields of one FOXML object into a dict.

    Args:
        tree: Parsed FOXML document; searched for the ``THUMBNAIL_0`` and
            ``ACCESSCOPY_0`` datastreams.
        namespaces: Prefix-to-URI mapping; must define the ``foxml``,
            ``ebucore`` and ``ns2`` prefixes used by the lookups.
        metadata_datastream_version: ``foxml:datastreamVersion`` element that
            holds the ebuCore metadata.

    Returns:
        ``None`` when the metadata has no ``isMemberOf`` relation text.
        Otherwise a dict that always has ``record_set`` and ``object_type``
        (``"unknown"`` when no type label is present) and, when the data is
        available, ``document_id``, ``locator``,
        ``thumbnail_original_filename`` / ``thumbnail_ref`` and
        ``accesscopy_original_filename`` / ``accesscopy_ref``.
    """
    def metadata_element(relative_path):
        return metadata_datastream_version.find(_CORE_METADATA_PATH + relative_path, namespaces)

    relation_el = metadata_element("ebucore:isMemberOf/ns2:relation")
    if relation_el is None or relation_el.text is None:
        return None

    fedora_object = {'record_set': relation_el.text}

    identifier_el = metadata_element("ebucore:identifier[@typeLabel='Original']/ns2:identifier")
    if identifier_el is not None and identifier_el.text is not None:
        fedora_object['document_id'] = identifier_el.text

    locator_el = metadata_element("ebucore:format/ebucore:essenceLocator/ebucore:locatorInfo")
    if locator_el is not None and locator_el.text is not None:
        fedora_object['locator'] = locator_el.text

    objecttype_el = metadata_element("ebucore:type/ebucore:objectType")
    # .get() so that an objectType element without a typeLabel attribute
    # falls back to "unknown" instead of raising KeyError.
    type_label = None if objecttype_el is None else objecttype_el.attrib.get('typeLabel')
    fedora_object['object_type'] = "unknown" if type_label is None else type_label

    thumbnail = _latest_datastream_file(tree, namespaces, 'THUMBNAIL_0')
    if thumbnail is not None:
        fedora_object['thumbnail_original_filename'], fedora_object['thumbnail_ref'] = thumbnail

    accesscopy = _latest_datastream_file(tree, namespaces, 'ACCESSCOPY_0')
    if accesscopy is not None:
        fedora_object['accesscopy_original_filename'], fedora_object['accesscopy_ref'] = accesscopy

    return fedora_object
def calculate_data_stream_path(datastream_store_path, datastream_id):
    """Build the file path of a datastream inside the datastream store.

    The id is turned into ``info:fedora/<id with '+' replaced by '/'>``. The
    file name is that string fully percent-encoded (including ``_``), and the
    sub-directory is the first two hex digits of the MD5 of the unencoded
    string.
    NOTE(review): presumably this mirrors the store's on-disk layout; confirm
    against the repository configuration.

    Args:
        datastream_store_path: Root directory of the datastream store.
        datastream_id: Datastream reference with ``+`` as separator.

    Returns:
        ``<datastream_store_path>/<2 hex digits>/<encoded id>``.
    """
    full_id = "info:fedora/" + datastream_id.replace('+', '/')
    # quote() never escapes '_', even with safe='', so it is encoded by hand.
    encoded_name = quote(full_id, safe='').replace('_', '%5F')
    bucket = md5(full_id.encode()).hexdigest()[0:2]
    return os.path.join(datastream_store_path, bucket, encoded_name)
def copy_thumbnail(datastream_store_path, output_directory, fedora_object):
    """Copy an object's thumbnail into ``<output>/<record_set>/thumbnails``.

    The copy is named ``<document_id><extension>``, where the extension is
    taken from the thumbnail's original file name.

    Args:
        datastream_store_path: Root directory of the datastream store.
        output_directory: Root directory of the export.
        fedora_object: Dict providing ``record_set``, ``document_id``,
            ``thumbnail_original_filename`` and ``thumbnail_ref``.

    Raises:
        KeyError: If one of the required keys is missing.
        OSError: If the directory cannot be created or the copy fails.
    """
    thumbnail_directory = os.path.join(output_directory, fedora_object['record_set'], 'thumbnails')
    # exist_ok avoids the race between checking for and creating the directory.
    os.makedirs(thumbnail_directory, exist_ok=True)

    extension = os.path.splitext(fedora_object['thumbnail_original_filename'])[1]
    destination_path = os.path.join(thumbnail_directory, fedora_object['document_id'] + extension)
    thumbnail_path = calculate_data_stream_path(datastream_store_path, fedora_object['thumbnail_ref'])

    copy2(thumbnail_path, destination_path, follow_symlinks=False)
    print("Copied '{}' to '{}'".format(thumbnail_path, destination_path))
def copy_accesscopy(datastream_store_path, output_directory, fedora_object):
    """Copy an object's access copy into ``<output>/<record_set>/media``.

    The copy is named ``<document_id><extension>``, where the extension is
    taken from the access copy's original file name.

    Args:
        datastream_store_path: Root directory of the datastream store.
        output_directory: Root directory of the export.
        fedora_object: Dict providing ``record_set``, ``document_id``,
            ``accesscopy_original_filename`` and ``accesscopy_ref``.

    Raises:
        KeyError: If one of the required keys is missing.
        OSError: If the directory cannot be created or the copy fails.
    """
    accesscopy_directory = os.path.join(output_directory, fedora_object['record_set'], 'media')
    # exist_ok avoids the race between checking for and creating the directory.
    os.makedirs(accesscopy_directory, exist_ok=True)

    extension = os.path.splitext(fedora_object['accesscopy_original_filename'])[1]
    destination_path = os.path.join(accesscopy_directory, fedora_object['document_id'] + extension)
    accesscopy_path = calculate_data_stream_path(datastream_store_path, fedora_object['accesscopy_ref'])

    copy2(accesscopy_path, destination_path, follow_symlinks=False)
    print("Copied '{}' to '{}'".format(accesscopy_path, destination_path))
def copy_http_resource(accesscopy_path, output_directory, fedora_object):
    """Copy a locally available file into ``<output>/<record_set>/media``.

    The copy is named ``<document_id><extension>``, keeping the extension of
    ``accesscopy_path``.

    Args:
        accesscopy_path: Path of the source file on the local file system.
        output_directory: Root directory of the export.
        fedora_object: Dict providing ``record_set`` and ``document_id``.

    Raises:
        KeyError: If one of the required keys is missing.
        OSError: If the directory cannot be created or the copy fails.
    """
    accesscopy_directory = os.path.join(output_directory, fedora_object['record_set'], 'media')
    # exist_ok avoids the race between checking for and creating the directory.
    os.makedirs(accesscopy_directory, exist_ok=True)

    extension = os.path.splitext(accesscopy_path)[1]
    destination_path = os.path.join(accesscopy_directory, fedora_object['document_id'] + extension)

    copy2(accesscopy_path, destination_path, follow_symlinks=False)
    print("Copied '{}' to '{}'".format(accesscopy_path, destination_path))
def copy_rtmp_resource(accesscopy_original_filename, rtmp_files_path, output_directory, fedora_object):
    """Copy a streaming file into ``<output>/<record_set>/media``.

    The source is looked up first as
    ``<rtmp_files_path>/<accesscopy_original_filename>`` and then as
    ``<rtmp_files_path>/open/<accesscopy_original_filename>``. The first one
    that is an existing file is copied to ``<document_id><extension>``.

    Args:
        accesscopy_original_filename: File name (or relative path) of the
            source below ``rtmp_files_path``.
        rtmp_files_path: Directory holding the streaming files.
        output_directory: Root directory of the export.
        fedora_object: Dict providing ``record_set`` and ``document_id``.

    Raises:
        KeyError: If one of the required keys is missing.
        FileNotFoundError: If the file exists in neither location.
        OSError: If the directory cannot be created or the copy fails.
    """
    accesscopy_directory = os.path.join(output_directory, fedora_object['record_set'], 'media')
    # exist_ok avoids the race between checking for and creating the directory.
    os.makedirs(accesscopy_directory, exist_ok=True)

    extension = os.path.splitext(accesscopy_original_filename)[1]
    destination_path = os.path.join(accesscopy_directory, fedora_object['document_id'] + extension)

    candidate_paths = (
        os.path.join(rtmp_files_path, accesscopy_original_filename),
        os.path.join(rtmp_files_path, 'open', accesscopy_original_filename),
    )
    for candidate_path in candidate_paths:
        if os.path.isfile(candidate_path):
            copy2(candidate_path, destination_path, follow_symlinks=False)
            print("Copied '{}' to '{}'".format(candidate_path, destination_path))
            return

    # FileNotFoundError is still an Exception, so existing handlers keep working.
    raise FileNotFoundError('rtmp resource not found.')
def parse_foxml(file):
    """Parse one FOXML file and extract its export-relevant fields.

    The newest ``foxml:datastreamVersion`` (by ``CREATED``) labelled
    ``Internal Memobase Metadata`` inside the ``TRANSFORMED_METADATA_0``
    datastream is handed to ``parse_into_fedora_object``.

    Args:
        file: File name or file object accepted by ``ET.parse``.

    Returns:
        The dict built by ``parse_into_fedora_object``, or ``None`` when the
        document has no such metadata version (or that function returns
        ``None``).
    """
    # 'dc' and 'ns2' deliberately map to the same URI; the lookups use 'ns2'.
    namespaces = {
        'foxml': 'info:fedora/fedora-system:def/foxml#',
        'oai_dc': 'http://www.openarchives.org/OAI/2.0/oai_dc/',
        'dc': 'http://purl.org/dc/elements/1.1/',
        'ebucore': 'urn:ebu:metadata-schema:ebuCore_2012',
        'ns2': 'http://purl.org/dc/elements/1.1/',
    }
    tree = ET.parse(file)
    metadata_datastream_version = get_last_element(
        tree,
        "foxml:datastream[@ID='TRANSFORMED_METADATA_0']/foxml:datastreamVersion[@LABEL='Internal Memobase Metadata']",
        namespaces,
        'CREATED',
    )
    if metadata_datastream_version is None:
        return None
    return parse_into_fedora_object(tree, namespaces, metadata_datastream_version)
# Export driver: walks the object store, parses every file as FOXML and copies
# the FOXML plus its thumbnail and access copy into one directory per record set.
objectstore_path = '/mnt/scratch/570_Memoriav/Datenexport/objectStore'
datastreamstore_path = '/mnt/scratch/570_Memoriav/Datenexport/datastreamStore'
http_files_path = '/mnt/scratch/570_Memoriav/Datenexport/public-files'
rtmp_files_path = '/mnt/scratch/570_Memoriav/Datenexport/library'
output_directory = '/mnt/scratch/570_Memoriav/sftp_20201130-2'

# Locator prefixes that map onto local directories.
http_locator_prefix = 'https://memobase.ch/files/'
rtmp_locator_prefix = 'rtmp://intstream.memobase.ch:1935/memobase/'

# Object types whose thumbnails are exported. 'unknown' is the fallback value
# assigned by parse_into_fedora_object (the list previously had the typo
# 'unkown', which could never match it).
thumbnail_object_types = ('film', 'Film', 'tbs', 'television', 'TV', 'video', 'Video', 'unknown')

# exist_ok avoids the race between checking for and creating the directory.
os.makedirs(output_directory, exist_ok=True)

# r=root, d=directories, f = files
for r, d, f in os.walk(objectstore_path):
    for file in f:
        foxml_path = os.path.join(r, file)
        try:
            print("Parsing FOXML file: " + foxml_path)
            fedora_object = parse_foxml(foxml_path)
            if fedora_object is None:
                print("Ignoring " + foxml_path)
            else:
                print("Extracting files for: record set: " + fedora_object['record_set'] + ' ; document: ' + fedora_object['document_id'])
                print("fedora_object: ")
                print(fedora_object)

                record_set_path = os.path.join(output_directory, fedora_object['record_set'])
                os.makedirs(record_set_path, exist_ok=True)
                foxml_destination_path = os.path.join(record_set_path, os.path.basename(foxml_path) + ".xml")
                copy2(foxml_path, foxml_destination_path, follow_symlinks=False)

                if 'thumbnail_ref' in fedora_object and fedora_object['object_type'] in thumbnail_object_types:
                    copy_thumbnail(datastreamstore_path, output_directory, fedora_object)

                if 'accesscopy_ref' in fedora_object:
                    copy_accesscopy(datastreamstore_path, output_directory, fedora_object)
                elif 'locator' in fedora_object:
                    # copy streaming resource
                    locator = fedora_object['locator']
                    if locator.startswith(http_locator_prefix):
                        accesscopy_path = os.path.join(http_files_path, locator[len(http_locator_prefix):])
                        copy_http_resource(accesscopy_path, output_directory, fedora_object)
                    elif locator.startswith(rtmp_locator_prefix):
                        rtmp_rel_path = locator[len(rtmp_locator_prefix):]
                        # Drop everything up to the first ':'; when there is no
                        # ':' find() returns -1 and the whole path is kept.
                        accesscopy_filename = rtmp_rel_path[rtmp_rel_path.find(':') + 1:]
                        copy_rtmp_resource(accesscopy_filename, rtmp_files_path, output_directory, fedora_object)
                print("Successfully extracted any files")
        except Exception:
            # Best effort: report the failure and continue with the next file.
            # Exception (not a bare except) lets Ctrl-C and SystemExit stop the run.
            traceback.print_exc(limit=1, file=sys.stdout)
print("Finished data extraction")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment