From 692741fc20d01bb69edbb710abc68cfa53c742f0 Mon Sep 17 00:00:00 2001
From: Jonas Waeber <jonaswaeber@gmail.com>
Date: Tue, 8 Oct 2019 10:22:41 +0200
Subject: [PATCH] add loop to re-check for newer dumps every 24h: skip
 existing files and delete outdated versions of the same artifact

---
 src/download.py | 59 ++++++++++++++++++++++++++++++-------------------
 1 file changed, 36 insertions(+), 23 deletions(-)

diff --git a/src/download.py b/src/download.py
index 0cdc39c..97bb6bb 100644
--- a/src/download.py
+++ b/src/download.py
@@ -5,6 +5,7 @@ import yaml
 from yaml import BaseLoader
 import logging
 import requests
+import time
 from SPARQLWrapper import SPARQLWrapper, JSON
 
 if __name__ == '__main__':
@@ -21,33 +22,45 @@ if __name__ == '__main__':
 
     sparql = SPARQLWrapper(conf['sparql']['endpoint'])
 
-    for language in ['de', 'en', 'fr', 'it', 'commons']:
-        with open(conf['sparql']['files'][language], 'r') as fp:
-            query = fp.read()
+    while True:  # re-check for newer dumps every 24 hours
+        for language in ['de', 'en', 'fr', 'it', 'commons']:
+            with open(conf['sparql']['files'][language], 'r') as fp:
+                query = fp.read()
 
-        sparql.setQuery(query)
-        sparql.setReturnFormat(JSON)
-        results = sparql.query().convert()
-        for binding in results['results']['bindings']:
-            download_link: str = binding['file']['value']
-            latest_version: str = binding['latestVersion']['value'].replace('.', '-')
-            artifact: str = binding['artifact']['value']
-            file_name = f'{latest_version}-{download_link.split("/")[-1]}'
+            sparql.setQuery(query)
+            sparql.setReturnFormat(JSON)
+            results = sparql.query().convert()
+            for binding in results['results']['bindings']:
+                download_link: str = binding['file']['value']
+                file_name_artifact: str = download_link.split('/')[-1]
+                latest_version: str = binding['latestVersion']['value'].replace('.', '-')
+                file_name = f'{latest_version}-{file_name_artifact}'
 
-            logging.info(f"Download file from {download_link}.")
-            with requests.get(download_link, stream=True) as response:
-                if response.ok:
-                    with open(f'/tmp/{file_name}', 'wb') as f:
-                        for chunk in response.iter_content(chunk_size=8192):
-                            if chunk:  # filter out keep-alive new chunks
-                                f.write(chunk)
-                    shutil.move(f'/tmp/{file_name}', f'{conf["output"][language]}/{file_name}')
+                if os.path.exists(f"{conf['output'][language]}/{file_name}"):
+                    continue  # file already downloaded, skip it
+                else:
+                    # delete old files of the same artifact.
+                    for root, directories, files in os.walk(conf['output'][language]):
+                        for file in files:
+                            if file_name_artifact in file:
+                                os.remove(os.path.join(root, file))
+
+                logging.info(f"Download file from {download_link}.")
+                with requests.get(download_link, stream=True) as response:
+                    if response.ok:
+                        with open(f'/tmp/{file_name}', 'wb') as f:
+                            for chunk in response.iter_content(chunk_size=8192):
+                                if chunk:  # filter out keep-alive new chunks
+                                    f.write(chunk)
+                        shutil.move(f'/tmp/{file_name}', f'{conf["output"][language]}/{file_name}')
+
+                        logging.info(f"Finished download from {download_link} into {conf['output'][language]}/{file_name}")
+
+        logging.info("Finished download for dbpedia.")
+        logging.info("Now wait for 24h to check for newer dump!")
+        time.sleep(86_400)  # 24h
 
-            logging.info(f"Finished download from {download_link} into {conf['output'][language]}/{file_name}")
 
-    logging.info("Finished download for dbpedia.")
 
 
-      
-    
 
-- 
GitLab