Skip to content
Snippets Groups Projects
Commit 692741fc authored by Jonas Waeber
Browse files

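Add loop to re-check for newer dumps and retry the download every 24h

Delete old files of the same artifact before downloading a newer version.
Do not repeat the download of files that already exist.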
parent 1ed8a15e
No related branches found
Tags 1.1.0
No related merge requests found
Pipeline #58255 passed
......@@ -5,6 +5,7 @@ import yaml
from yaml import BaseLoader
import logging
import requests
import time
from SPARQLWrapper import SPARQLWrapper, JSON
if __name__ == '__main__':
......@@ -21,33 +22,45 @@ if __name__ == '__main__':
sparql = SPARQLWrapper(conf['sparql']['endpoint'])
for language in ['de', 'en', 'fr', 'it', 'commons']:
with open(conf['sparql']['files'][language], 'r') as fp:
query = fp.read()
while True:
for language in ['de', 'en', 'fr', 'it', 'commons']:
with open(conf['sparql']['files'][language], 'r') as fp:
query = fp.read()
sparql.setQuery(query)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()
for binding in results['results']['bindings']:
download_link: str = binding['file']['value']
latest_version: str = binding['latestVersion']['value'].replace('.', '-')
artifact: str = binding['artifact']['value']
file_name = f'{latest_version}-{download_link.split("/")[-1]}'
sparql.setQuery(query)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()
for binding in results['results']['bindings']:
download_link: str = binding['file']['value']
file_name_artifact: str = download_link.split('/')[-1]
latest_version: str = binding['latestVersion']['value'].replace('.', '-')
file_name = f'{latest_version}-{file_name_artifact}'
logging.info(f"Download file from {download_link}.")
with requests.get(download_link, stream=True) as response:
if response.ok:
with open(f'/tmp/{file_name}', 'wb') as f:
for chunk in response.iter_content(chunk_size=8192):
if chunk: # filter out keep-alive new chunks
f.write(chunk)
shutil.move(f'/tmp/{file_name}', f'{conf["output"][language]}/{file_name}')
if os.path.exists(f"{conf['output'][language]}/{file_name}"):
continue
else:
# delete old files of the same artifact.
for root, directories, files in os.walk(conf['output'][language]):
for file in files:
if file_name_artifact in file:
os.remove(os.path.join(root, file))
logging.info(f"Download file from {download_link}.")
with requests.get(download_link, stream=True) as response:
if response.ok:
with open(f'/tmp/{file_name}', 'wb') as f:
for chunk in response.iter_content(chunk_size=8192):
if chunk: # filter out keep-alive new chunks
f.write(chunk)
shutil.move(f'/tmp/{file_name}', f'{conf["output"][language]}/{file_name}')
logging.info(f"Finished download from {download_link} into {conf['output'][language]}/{file_name}")
logging.info("Finished download for dbpedia.")
logging.info("Now wait for 24h to check for newer dump!")
time.sleep(86_400) # 24h
logging.info(f"Finished download from {download_link} into {conf['output'][language]}/{file_name}")
logging.info("Finished download for dbpedia.")
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment