From b11daa3db5c93e2ab4fce827267472d50012190f Mon Sep 17 00:00:00 2001
From: Jonas Waeber <jonas.waeber@unibas.ch>
Date: Thu, 4 Jan 2024 12:16:04 +0100
Subject: [PATCH] Add script to generate list of valid wikidata links for
 testing. Fix issue with wikidata parsing and indexing. Improve performance
 test script.

---
 cmd/performance/main.go   | 92 ++++++++++++++++++++++++++++++++++++++-
 pkg/rest/wikidata.go      | 53 ++++++++++++++++++----
 pkg/rest/wikidata_item.go | 89 +++++++++++++++++++++++++++++++++++++
 pkg/service/cache.go      |  3 ++
 pkg/shared/resource.go    | 46 +++++++++++++++++---
 tests/generate_list.go    | 59 +++++++++++++++++++++++++
 6 files changed, 328 insertions(+), 14 deletions(-)
 create mode 100644 tests/generate_list.go

diff --git a/cmd/performance/main.go b/cmd/performance/main.go
index 9757f85..976f03a 100644
--- a/cmd/performance/main.go
+++ b/cmd/performance/main.go
@@ -8,6 +8,7 @@ import (
 	"github.com/go-resty/resty/v2"
 	"log"
 	"os"
+	"sort"
 	"time"
 )
 
@@ -15,7 +16,8 @@ func main() {
 	urlFlag := flag.String("url", "", "URL to test")
 	fileFlag := flag.String("file", "", "File containing URLs to test (one per line)")
 	repeatFlag := flag.Int("repeat", 1, "Number of times to repeat each request")
-	outputFlag := flag.String("output", "results.csv", "Output CSV file for measurements")
+	outputFlag := flag.String("output", "tests/results.csv", "Output CSV file for measurements")
+	statisticsFlag := flag.String("statistics", "tests/statistics.csv", "Output CSV file for statistics")
 	flag.Parse()
 
 	if (*urlFlag == "" && *fileFlag == "") || (*urlFlag != "" && *fileFlag != "") {
@@ -51,6 +53,10 @@ func main() {
 		log.Fatalf("Error while writing CSV header: %v", writeError)
 		return
 	}
+
+	// Variables to store statistics
+	var durations []float64
+
 	client := resty.New()
 
 	for _, url := range urls {
@@ -64,6 +70,7 @@ func main() {
 			}
 
 			duration := time.Since(startTime).Seconds()
+			durations = append(durations, duration)
 
 			writeError2 := writer.Write([]string{url, fmt.Sprint(i), fmt.Sprintf("%.6f", duration)})
 			if writeError2 != nil {
@@ -75,6 +82,91 @@ func main() {
 			fmt.Printf("Request #%d to %s took %.6f seconds. Status: %s\n", i, url, duration, resp.Status())
 		}
 	}
+
+	// Calculate and store statistics
+	storeStatistics(*statisticsFlag, durations)
+}
+
+// storeStatistics sorts durations in place, derives minimum, maximum, average,
+// median and total from them and passes the results to writeStatistics, which
+// appends them to statisticsFile. An empty slice has no statistics, so it is
+// reported and skipped instead of indexing out of range.
+func storeStatistics(statisticsFile string, durations []float64) {
+	if len(durations) == 0 {
+		log.Println("No measurements recorded, skipping statistics.")
+		return
+	}
+
+	sort.Float64s(durations)
+
+	minimum := durations[0]
+	maximum := durations[len(durations)-1]
+
+	var total float64
+	for _, duration := range durations {
+		total += duration
+	}
+
+	average := total / float64(len(durations))
+
+	// With an even count the median is the mean of the two middle values.
+	var median float64
+	if len(durations)%2 == 0 {
+		median = (durations[len(durations)/2-1] + durations[len(durations)/2]) / 2
+	} else {
+		median = durations[len(durations)/2]
+	}
+
+	// Write statistics to the file
+	writeStatistics(statisticsFile, minimum, maximum, average, median, total)
+}
+
+// writeStatistics appends one CSV row holding minimum, maximum, average,
+// median and total (six decimals each) to statisticFile, creating the file
+// when it does not exist. The header row is written first when the file is
+// empty. Any failure terminates the program through log.Fatalf.
+func writeStatistics(statisticFile string, minimum, maximum, average, median, total float64) {
+	// Open for appending: os.Open returns a read-only handle, so rows could
+	// never be added to a file left behind by a previous run.
+	file, err := os.OpenFile(statisticFile, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
+	if err != nil {
+		log.Fatalf("Error opening CSV file for statistics: %v", err)
+	}
+
+	fileInfo, err := file.Stat()
+	if err != nil {
+		log.Fatalf("Error reading CSV file info for statistics: %v", err)
+	}
+
+	statisticsWriter := csv.NewWriter(file)
+
+	// If the file is new or empty, add the header row
+	if fileInfo.Size() == 0 {
+		header := []string{"min", "max", "avg", "median", "total"}
+		if writeError := statisticsWriter.Write(header); writeError != nil {
+			log.Fatalf("Error while writing CSV header for statistics: %v", writeError)
+		}
+	}
+
+	writeError := statisticsWriter.Write([]string{
+		fmt.Sprintf("%.6f", minimum),
+		fmt.Sprintf("%.6f", maximum),
+		fmt.Sprintf("%.6f", average),
+		fmt.Sprintf("%.6f", median),
+		fmt.Sprintf("%.6f", total),
+	})
+	if writeError != nil {
+		log.Fatalf("Could not write statistics to CSV: %v.", writeError)
+	}
+
+	// csv.Writer buffers its output; write errors only surface after Flush.
+	statisticsWriter.Flush()
+	if flushError := statisticsWriter.Error(); flushError != nil {
+		log.Fatalf("Could not write statistics to CSV: %v.", flushError)
+	}
+	if closeError := file.Close(); closeError != nil {
+		log.Fatalf("Could not close statistics file properly: %v", closeError)
+	}
 }
 
 func readURLsFromFile(file string) []string {
diff --git a/pkg/rest/wikidata.go b/pkg/rest/wikidata.go
index 789a649..b01c80a 100644
--- a/pkg/rest/wikidata.go
+++ b/pkg/rest/wikidata.go
@@ -127,9 +127,9 @@ func (w WikidataRest) extractDescriptions(item WikidataItem) map[string]string {
 	return descriptions
 }
 
-func (w WikidataRest) extractProperties(item WikidataItem) map[string][]any {
+func (w WikidataRest) extractProperties(item WikidataItem) map[string][]*shared.Data {
 
-	properties := map[string][]any{}
+	properties := map[string][]*shared.Data{}
 label:
 	for statementKey, statementItem := range item.Statements {
 		if len(w.properties) > 0 {
@@ -141,16 +141,53 @@ label:
 		}
 		for _, statement := range statementItem {
 			switch statement.Property.DataType {
-			case "commonsMedia":
+			case "commonsMedia", "geographicshape", "tabular-data":
 				if _, ok := properties[statementKey]; !ok {
-					properties[statementKey] = []any{}
+					properties[statementKey] = []*shared.Data{}
 				}
-				properties[statementKey] = append(properties[statementKey], CommonsUrl+strings.Replace(statement.Value.Content.(string), " ", SpaceReplacement, -1))
-			default:
+				properties[statementKey] = append(properties[statementKey], &shared.Data{
+					Simple: CommonsUrl + strings.Replace(statement.Value.Content.(string), " ", SpaceReplacement, -1),
+				})
+			case "globecoordinate":
+				if _, ok := properties[statementKey]; !ok {
+					properties[statementKey] = []*shared.Data{}
+				}
+				properties[statementKey] = append(properties[statementKey], &shared.Data{
+					GlobeCoordinate: statement.Value.GetGlobeCoordinate(),
+				})
+
+			case "monolingualtext":
+				if _, ok := properties[statementKey]; !ok {
+					properties[statementKey] = []*shared.Data{}
+				}
+				properties[statementKey] = append(properties[statementKey], &shared.Data{
+					MonoLingualText: statement.Value.GetMonolingualText(),
+				})
+
+			case "quantity":
 				if _, ok := properties[statementKey]; !ok {
-					properties[statementKey] = []any{}
+					properties[statementKey] = []*shared.Data{}
 				}
-				properties[statementKey] = append(properties[statementKey], statement.Value.Content)
+				properties[statementKey] = append(properties[statementKey], &shared.Data{
+					Quantity: statement.Value.GetQuantity(),
+				})
+			case "time":
+				if _, ok := properties[statementKey]; !ok {
+					properties[statementKey] = []*shared.Data{}
+				}
+				properties[statementKey] = append(properties[statementKey], &shared.Data{
+					Time: statement.Value.GetTime(),
+				})
+			case "wikibase-item", "external-id", "url", "string", "musicalnote", "mathematical-expression":
+				if _, ok := properties[statementKey]; !ok {
+					properties[statementKey] = []*shared.Data{}
+				}
+				properties[statementKey] = append(properties[statementKey], &shared.Data{
+					Simple: statement.Value.Content.(string),
+				})
+
+			default:
+				w.logger.Error().Msgf("Found type that is not handled: %s", statement.Property.DataType)
 			}
 		}
 	}
diff --git a/pkg/rest/wikidata_item.go b/pkg/rest/wikidata_item.go
index cf7dc03..13e65a3 100644
--- a/pkg/rest/wikidata_item.go
+++ b/pkg/rest/wikidata_item.go
@@ -1,5 +1,11 @@
 package rest
 
+import (
+	"strconv"
+
+	"gitlab.switch.ch/ub-unibas/wikidata-service/pkg/shared"
+)
+
 type WikidataItem struct {
 	ID           string                 `json:"id" example:"Q42"`
 	Type         string                 `json:"type" example:"item"`
@@ -24,3 +30,125 @@ type StatementValue struct {
 	Type    string      `json:"type" example:"value"`
 	Content interface{} `json:"content"`
 }
+
+// GetTime reads Content as a JSON object and copies its "time", "timezone",
+// "before", "after", "precision" and "calendarmodel" members into a
+// shared.Time. Members that are missing or have an unexpected type keep their
+// zero value; if Content is not a JSON object an empty Time is returned.
+func (v StatementValue) GetTime() *shared.Time {
+	content, ok := v.Content.(map[string]interface{})
+	if !ok {
+		return &shared.Time{}
+	}
+
+	timeStr, timeOK := content["time"].(string)
+	timeZone, timeZoneOK := content["timezone"].(string)
+	before, beforeOK := content["before"].(string)
+	after, afterOK := content["after"].(string)
+	precision, precisionOK := content["precision"].(float64)
+	calendarModel, calendarModelOK := content["calendarmodel"].(string)
+
+	return &shared.Time{
+		Time:          checkOk(timeStr, timeOK),
+		TimeZone:      checkOk(timeZone, timeZoneOK),
+		Before:        checkOk(before, beforeOK),
+		After:         checkOk(after, afterOK),
+		Precision:     checkOkPrecision(precision, precisionOK),
+		CalendarModel: checkOk(calendarModel, calendarModelOK),
+	}
+}
+
+// checkOk returns value when ok is true and the empty string otherwise.
+func checkOk(value string, ok bool) string {
+	if ok {
+		return value
+	}
+	return ""
+}
+
+// checkOkPrecision returns value when ok is true and 0 otherwise.
+func checkOkPrecision(value float64, ok bool) float64 {
+	if ok {
+		return value
+	}
+	return 0
+}
+
+// coordinateString renders a latitude or longitude member as a string.
+// encoding/json decodes JSON numbers into float64, so those are formatted in
+// plain decimal notation with the shortest precision that round-trips;
+// strings are passed through and anything else (or a missing member) is "".
+func coordinateString(value interface{}) string {
+	switch coordinate := value.(type) {
+	case string:
+		return coordinate
+	case float64:
+		return strconv.FormatFloat(coordinate, 'f', -1, 64)
+	default:
+		return ""
+	}
+}
+
+// GetQuantity reads Content as a JSON object and copies its "amount", "unit",
+// "upperbound" and "lowerbound" string members into a shared.Quantity.
+// Missing members stay empty; if Content is not a JSON object an empty
+// Quantity is returned.
+func (v StatementValue) GetQuantity() *shared.Quantity {
+	content, ok := v.Content.(map[string]interface{})
+	if !ok {
+		return &shared.Quantity{}
+	}
+
+	// A failed assertion yields "", which is exactly the wanted default.
+	amount, _ := content["amount"].(string)
+	unit, _ := content["unit"].(string)
+	upperbound, _ := content["upperbound"].(string)
+	lowerbound, _ := content["lowerbound"].(string)
+
+	return &shared.Quantity{
+		Amount:     amount,
+		Unit:       unit,
+		Upperbound: upperbound,
+		Lowerbound: lowerbound,
+	}
+}
+
+// GetMonolingualText reads Content as a JSON object and copies its "text" and
+// "language" string members into a shared.MonoLingualText. Unexpected input
+// yields empty fields instead of a panic from an unchecked type assertion.
+func (v StatementValue) GetMonolingualText() *shared.MonoLingualText {
+	content, ok := v.Content.(map[string]interface{})
+	if !ok {
+		return &shared.MonoLingualText{}
+	}
+
+	text, _ := content["text"].(string)
+	language, _ := content["language"].(string)
+
+	return &shared.MonoLingualText{
+		Text:     text,
+		Language: language,
+	}
+}
+
+// GetGlobeCoordinate reads Content as a JSON object and copies its
+// "latitude", "longitude", "globe" and "precision" members into a
+// shared.GlobeCoordinate. Latitude and longitude are accepted as JSON numbers
+// or strings (see coordinateString). If Content is not a JSON object an empty
+// GlobeCoordinate is returned.
+func (v StatementValue) GetGlobeCoordinate() *shared.GlobeCoordinate {
+	content, ok := v.Content.(map[string]interface{})
+	if !ok {
+		return &shared.GlobeCoordinate{}
+	}
+
+	globe, _ := content["globe"].(string)
+	precision, _ := content["precision"].(float64)
+
+	return &shared.GlobeCoordinate{
+		Latitude:  coordinateString(content["latitude"]),
+		Longitude: coordinateString(content["longitude"]),
+		Globe:     globe,
+		Precision: precision,
+	}
+}
diff --git a/pkg/service/cache.go b/pkg/service/cache.go
index 0b327c4..c8782f8 100644
--- a/pkg/service/cache.go
+++ b/pkg/service/cache.go
@@ -43,6 +43,9 @@ func (cache *WikidataCache) Get(key string) (*shared.Resource, error) {
 		}
 		return nil
 	})
+	if err != nil {
+		return nil, err
+	}
 	var resource *shared.Resource
 	unmarshalError := json.Unmarshal(extractedValue, resource)
 	if unmarshalError != nil {
diff --git a/pkg/shared/resource.go b/pkg/shared/resource.go
index 851f2d5..6ceceac 100644
--- a/pkg/shared/resource.go
+++ b/pkg/shared/resource.go
@@ -9,11 +9,47 @@ import (
 
 // Resource is a generic representation of a Wikidata resource.
 type Resource struct {
-	ID           string            `json:"id"`
-	Labels       map[string]string `json:"labels"`
-	Descriptions map[string]string `json:"descriptions"`
-	Data         map[string][]any  `json:"data"`
-	LastModified string            `json:"lastModified"`
+	ID           string             `json:"id"`
+	Labels       map[string]string  `json:"labels,omitempty"`
+	Descriptions map[string]string  `json:"descriptions,omitempty"`
+	Data         map[string][]*Data `json:"data,omitempty"`
+	LastModified string             `json:"lastModified,omitempty"`
+}
+
+type Data struct { // Data is one property value; empty fields are omitted from the JSON output.
+	Simple          string           `json:"simple,omitempty"`          // plain string value
+	Time            *Time            `json:"time,omitempty"`            // structured time value
+	Quantity        *Quantity        `json:"quantity,omitempty"`        // structured quantity value
+	GlobeCoordinate *GlobeCoordinate `json:"globeCoordinate,omitempty"` // structured coordinate value
+	MonoLingualText *MonoLingualText `json:"monoLingualText,omitempty"` // text together with its language
+}
+
+type GlobeCoordinate struct { // GlobeCoordinate is the structured form of a coordinate value.
+	Latitude  string  `json:"latitude,omitempty"`  // latitude, kept as a string
+	Longitude string  `json:"longitude,omitempty"` // longitude, kept as a string
+	Globe     string  `json:"globe,omitempty"`     // NOTE(review): presumably identifies the reference globe — confirm
+	Precision float64 `json:"precision,omitempty"` // numeric precision; omitted from JSON when 0
+}
+
+type Time struct { // Time is the structured form of a time value.
+	Time          string  `json:"time,omitempty"`          // the timestamp, kept as a string
+	TimeZone      string  `json:"timezone,omitempty"`      // time zone, kept as a string
+	Before        string  `json:"before,omitempty"`        // NOTE(review): semantics not visible here — confirm
+	After         string  `json:"after,omitempty"`         // NOTE(review): semantics not visible here — confirm
+	Precision     float64 `json:"precision,omitempty"`     // numeric precision; omitted from JSON when 0
+	CalendarModel string  `json:"calendarmodel,omitempty"` // calendar model, kept as a string
+}
+
+type Quantity struct { // Quantity is the structured form of a quantity value; all members are strings.
+	Amount     string `json:"amount,omitempty"`     // the amount
+	Unit       string `json:"unit,omitempty"`       // the unit of the amount
+	Upperbound string `json:"upperbound,omitempty"` // optional upper bound; empty when absent
+	Lowerbound string `json:"lowerbound,omitempty"` // optional lower bound; empty when absent
+}
+
+type MonoLingualText struct { // MonoLingualText is a text in a single language.
+	Text     string `json:"value,omitempty"`    // NOTE(review): serialised under the key "value", not "text" — confirm intended
+	Language string `json:"language,omitempty"` // language of Text
 }
 
 // ToReader converts the Resource to an io.Reader.
diff --git a/tests/generate_list.go b/tests/generate_list.go
new file mode 100644
index 0000000..281c6e7
--- /dev/null
+++ b/tests/generate_list.go
@@ -0,0 +1,84 @@
+package main
+
+import (
+	"encoding/json"
+	"fmt"
+	"io"
+	"net/http"
+	"os"
+	"time"
+)
+
+// httpClient is shared by all lookups; the timeout keeps a single stalled
+// request from blocking the whole run.
+var httpClient = &http.Client{Timeout: 30 * time.Second}
+
+// main probes the identifiers Q1 to Q100000 against the Wikidata REST API and
+// writes one service request URL per existing item to
+// tests/wikidata_requests.txt. Progress is printed to standard output.
+func main() {
+	// Create or open a file to write the requests
+	file, err := os.Create("tests/wikidata_requests.txt")
+	if err != nil {
+		fmt.Printf("Error creating file: %v\n", err)
+		return
+	}
+	defer func() {
+		if closeErr := file.Close(); closeErr != nil {
+			fmt.Printf("Error closing file: %v\n", closeErr)
+		}
+	}()
+
+	// Wikidata item identifiers start at Q1, so Q0 is not probed.
+	for i := 1; i <= 100000; i++ {
+		identifier := fmt.Sprintf("Q%d", i)
+
+		exists, err := checkWikidataIdentifier(identifier)
+		switch {
+		case err != nil:
+			fmt.Printf("Error checking identifier %s: %v\n", identifier, err)
+		case exists:
+			fmt.Printf("Identifier %s exists on Wikidata.\n", identifier)
+			_, writeErr := fmt.Fprintf(file, "https://wikidata.ub-dd-prod.k8s.unibas.ch/api/v1/wikidata/%s\n", identifier)
+			if writeErr != nil {
+				fmt.Printf("Error writing to file: %v\n", writeErr)
+				return
+			}
+		default:
+			fmt.Printf("Identifier %s does not exist on Wikidata.\n", identifier)
+		}
+	}
+}
+
+// checkWikidataIdentifier asks the Wikidata REST API whether the item with the
+// given identifier exists. It returns true for a 200 response with a decodable
+// JSON body, false for 404, and an error for transport failures, undecodable
+// bodies and every other status.
+func checkWikidataIdentifier(identifier string) (bool, error) {
+	apiEndpoint := "https://wikidata.org/w/rest.php/wikibase/v0/entities/items/" + identifier
+	resp, err := httpClient.Get(apiEndpoint)
+	if err != nil {
+		return false, err
+	}
+	defer resp.Body.Close()
+
+	switch resp.StatusCode {
+	case http.StatusOK:
+		var response map[string]interface{}
+		if err := decodeJSON(resp.Body, &response); err != nil {
+			return false, err
+		}
+		return true, nil
+	case http.StatusNotFound:
+		// A missing item is an answer, not a failure.
+		return false, nil
+	default:
+		return false, fmt.Errorf("wikidata API returned status: %s", resp.Status)
+	}
+}
+
+// decodeJSON decodes the first JSON value read from reader into target.
+func decodeJSON(reader io.Reader, target interface{}) error {
+	decoder := json.NewDecoder(reader)
+	return decoder.Decode(target)
+}
-- 
GitLab