From 692741fc20d01bb69edbb710abc68cfa53c742f0 Mon Sep 17 00:00:00 2001 From: Jonas Waeber <jonaswaeber@gmail.com> Date: Tue, 8 Oct 2019 10:22:41 +0200 Subject: [PATCH] add loop to retry download after 24h delete old files does not repeat download of existing files. --- src/download.py | 59 ++++++++++++++++++++++++++++++------------------- 1 file changed, 36 insertions(+), 23 deletions(-) diff --git a/src/download.py b/src/download.py index 0cdc39c..97bb6bb 100644 --- a/src/download.py +++ b/src/download.py @@ -5,6 +5,7 @@ import yaml from yaml import BaseLoader import logging import requests +import time from SPARQLWrapper import SPARQLWrapper, JSON if __name__ == '__main__': @@ -21,33 +22,45 @@ if __name__ == '__main__': sparql = SPARQLWrapper(conf['sparql']['endpoint']) - for language in ['de', 'en', 'fr', 'it', 'commons']: - with open(conf['sparql']['files'][language], 'r') as fp: - query = fp.read() + while True: + for language in ['de', 'en', 'fr', 'it', 'commons']: + with open(conf['sparql']['files'][language], 'r') as fp: + query = fp.read() - sparql.setQuery(query) - sparql.setReturnFormat(JSON) - results = sparql.query().convert() - for binding in results['results']['bindings']: - download_link: str = binding['file']['value'] - latest_version: str = binding['latestVersion']['value'].replace('.', '-') - artifact: str = binding['artifact']['value'] - file_name = f'{latest_version}-{download_link.split("/")[-1]}' + sparql.setQuery(query) + sparql.setReturnFormat(JSON) + results = sparql.query().convert() + for binding in results['results']['bindings']: + download_link: str = binding['file']['value'] + file_name_artifact: str = download_link.split('/')[-1] + latest_version: str = binding['latestVersion']['value'].replace('.', '-') + file_name = f'{latest_version}-{file_name_artifact}' - logging.info(f"Download file from {download_link}.") - with requests.get(download_link, stream=True) as response: - if response.ok: - with open(f'/tmp/{file_name}', 'wb') as f: - for chunk in response.iter_content(chunk_size=8192): - if chunk: # filter out keep-alive new chunks - f.write(chunk) - shutil.move(f'/tmp/{file_name}', f'{conf["output"][language]}/{file_name}') + if os.path.exists(f"{conf['output'][language]}/{file_name}"): + continue + else: + # delete old files of the same artifact. + for root, directories, files in os.walk(conf['output'][language]): + for file in files: + if file_name_artifact in file: + os.remove(os.path.join(root, file)) + + logging.info(f"Download file from {download_link}.") + with requests.get(download_link, stream=True) as response: + if response.ok: + with open(f'/tmp/{file_name}', 'wb') as f: + for chunk in response.iter_content(chunk_size=8192): + if chunk: # filter out keep-alive new chunks + f.write(chunk) + shutil.move(f'/tmp/{file_name}', f'{conf["output"][language]}/{file_name}') + + logging.info(f"Finished download from {download_link} into {conf['output'][language]}/{file_name}") + + logging.info("Finished download for dbpedia.") + logging.info("Now wait for 24h to check for newer dump!") + time.sleep(86_400) # 24h - logging.info(f"Finished download from {download_link} into {conf['output'][language]}/{file_name}") - logging.info("Finished download for dbpedia.") - - -- GitLab