From b11daa3db5c93e2ab4fce827267472d50012190f Mon Sep 17 00:00:00 2001
From: Jonas Waeber <jonas.waeber@unibas.ch>
Date: Thu, 4 Jan 2024 12:16:04 +0100
Subject: [PATCH] add script to generate list of valid wikidata links for testing. Fix issue with wikidata parsing and indexing improve performance test script

---
Review notes (text between the three-dash line and the diffstat is ignored by git am):

The line structure of this patch was restored from a whitespace-collapsed copy.
Leading tabs, column alignment and blank context lines were reconstructed from
gofmt conventions and the hunk line counts; use "git apply --ignore-whitespace"
if a context line does not match. The blob ids on the "index" lines are those of
the original patch and do not describe the amended contents.

Changes relative to the original patch:
 * cmd/performance/main.go: storeStatistics returns early on an empty slice
   (durations[0] panicked); writeStatistics opens an existing file with
   os.OpenFile in append mode (os.Open is read-only, so nothing could be
   appended) and checks csv.Writer.Error after the final Flush.
 * pkg/rest/wikidata_item.go: GetMonolingualText and GetGlobeCoordinate use
   checked type assertions, like GetTime and GetQuantity, instead of panicking.
 * tests/generate_list.go: a 404 response is reported as "does not exist" (it
   was returned as an error, which made that branch unreachable); requests use
   a client with a timeout; a failed write is reported before exiting.
 * Doc comments were added to the new functions and types.

Open points, not changed here:
 * pkg/service/cache.go: the context lines pass a nil *shared.Resource to
   json.Unmarshal; confirm against the full function.
 * GetGlobeCoordinate asserts latitude/longitude as strings; verify against the
   API payload, which presumably carries them as numbers.

 cmd/performance/main.go   | 108 +++++++++++++++++++++++++++++++++++++++-
 pkg/rest/wikidata.go      |  53 +++++++++++++----
 pkg/rest/wikidata_item.go | 106 ++++++++++++++++++++++++++++++++++++++
 pkg/service/cache.go      |   3 ++
 pkg/shared/resource.go    |  52 ++++++++++++++---
 tests/generate_list.go    |  73 +++++++++++++++++++++++++++
 6 files changed, 381 insertions(+), 14 deletions(-)
 create mode 100644 tests/generate_list.go

diff --git a/cmd/performance/main.go b/cmd/performance/main.go
index 9757f85..976f03a 100644
--- a/cmd/performance/main.go
+++ b/cmd/performance/main.go
@@ -8,6 +8,7 @@ import (
 	"github.com/go-resty/resty/v2"
 	"log"
 	"os"
+	"sort"
 	"time"
 )
 
@@ -15,7 +16,8 @@ func main() {
 	urlFlag := flag.String("url", "", "URL to test")
 	fileFlag := flag.String("file", "", "File containing URLs to test (one per line)")
 	repeatFlag := flag.Int("repeat", 1, "Number of times to repeat each request")
-	outputFlag := flag.String("output", "results.csv", "Output CSV file for measurements")
+	outputFlag := flag.String("output", "tests/results.csv", "Output CSV file for measurements")
+	statisticsFlag := flag.String("statistics", "tests/statistics.csv", "Output CSV file for statistics")
 	flag.Parse()
 
 	if (*urlFlag == "" && *fileFlag == "") || (*urlFlag != "" && *fileFlag != "") {
@@ -51,6 +53,10 @@ func main() {
 		log.Fatalf("Error while writing CSV header: %v", writeError)
 		return
 	}
+
+	// Variables to store statistics
+	var durations []float64
+
 	client := resty.New()
 
 	for _, url := range urls {
@@ -64,6 +70,7 @@ func main() {
 			}
 
 			duration := time.Since(startTime).Seconds()
+			durations = append(durations, duration)
 
 			writeError2 := writer.Write([]string{url, fmt.Sprint(i), fmt.Sprintf("%.6f", duration)})
 			if writeError2 != nil {
@@ -75,6 +82,105 @@ func main() {
 			fmt.Printf("Request #%d to %s took %.6f seconds. Status: %s\n", i, url, duration, resp.Status())
 		}
 	}
+
+	// Calculate and store statistics
+	storeStatistics(*statisticsFlag, durations)
+}
+
+// storeStatistics sorts durations in place, derives minimum, maximum, average,
+// median and total from them and appends the result to statisticsFile.
+func storeStatistics(statisticsFile string, durations []float64) {
+	// Without any measurement durations[0] below would panic.
+	if len(durations) == 0 {
+		log.Println("No measurements recorded, skipping statistics.")
+		return
+	}
+
+	sort.Float64s(durations)
+
+	minimum := durations[0]
+	maximum := durations[len(durations)-1]
+
+	var total float64
+	for _, duration := range durations {
+		total += duration
+	}
+
+	average := total / float64(len(durations))
+
+	var median float64
+	if len(durations)%2 == 0 {
+		median = (durations[len(durations)/2-1] + durations[len(durations)/2]) / 2
+	} else {
+		median = durations[len(durations)/2]
+	}
+
+	// Write statistics to the file
+	writeStatistics(statisticsFile, minimum, maximum, average, median, total)
+}
+
+// writeStatistics appends one row with the given values to the CSV file
+// statisticFile. The file is created if it is missing, and a header row is
+// written first when the file is new or empty.
+func writeStatistics(statisticFile string, minimum, maximum, average, median, total float64) {
+	// Check if the file exists
+	fileInfo, err := os.Stat(statisticFile)
+	fileExists := !os.IsNotExist(err)
+
+	var file *os.File
+
+	if !fileExists {
+		file, err = os.Create(statisticFile)
+		if err != nil {
+			fmt.Println("Error creating CSV file for statistics:", err)
+			os.Exit(1)
+		}
+	} else {
+		// os.Open would be read-only; append so that earlier rows are kept.
+		file, err = os.OpenFile(statisticFile, os.O_APPEND|os.O_WRONLY, 0644)
+		if err != nil {
+			fmt.Println("Error opening CSV file for statistics:", err)
+			os.Exit(1)
+		}
+	}
+
+	defer func(statistics *os.File) {
+		errFile := statistics.Close()
+		if errFile != nil {
+			log.Fatal("Could not close statistics file properly.")
+			return
+		}
+	}(file)
+
+	statisticsWriter := csv.NewWriter(file)
+	defer statisticsWriter.Flush()
+
+	// If the file doesn't exist or is empty, add the header row
+	if !fileExists || fileInfo.Size() == 0 {
+		header := []string{"min", "max", "avg", "median", "total"}
+		writeError := statisticsWriter.Write(header)
+		if writeError != nil {
+			log.Fatalf("Error while writing CSV header for statistics: %v", writeError)
+			return
+		}
+		statisticsWriter.Flush()
+	}
+
+	writeError := statisticsWriter.Write([]string{fmt.Sprintf("%.6f", minimum),
+		fmt.Sprintf("%.6f", maximum),
+		fmt.Sprintf("%.6f", average),
+		fmt.Sprintf("%.6f", median),
+		fmt.Sprintf("%.6f", total),
+	})
+	if writeError != nil {
+		log.Fatalf("Could not write statistics to CSV: %v.", writeError)
+		return
+	}
+	statisticsWriter.Flush()
+	// Flush does not return an error; csv.Writer.Error reports a failed write.
+	if flushError := statisticsWriter.Error(); flushError != nil {
+		log.Fatalf("Could not write statistics to CSV: %v.", flushError)
+	}
 }
 
 func readURLsFromFile(file string) []string {
diff --git a/pkg/rest/wikidata.go b/pkg/rest/wikidata.go
index 789a649..b01c80a 100644
--- a/pkg/rest/wikidata.go
+++ b/pkg/rest/wikidata.go
@@ -127,9 +127,9 @@ func (w WikidataRest) extractDescriptions(item WikidataItem) map[string]string {
 	return descriptions
 }
 
-func (w WikidataRest) extractProperties(item WikidataItem) map[string][]any {
+func (w WikidataRest) extractProperties(item WikidataItem) map[string][]*shared.Data {
 
-	properties := map[string][]any{}
+	properties := map[string][]*shared.Data{}
 label:
 	for statementKey, statementItem := range item.Statements {
 		if len(w.properties) > 0 {
@@ -141,16 +141,53 @@ label:
 		}
 		for _, statement := range statementItem {
 			switch statement.Property.DataType {
-			case "commonsMedia":
+			case "commonsMedia", "geographicshape", "tabular-data":
 				if _, ok := properties[statementKey]; !ok {
-					properties[statementKey] = []any{}
+					properties[statementKey] = []*shared.Data{}
 				}
-				properties[statementKey] = append(properties[statementKey], CommonsUrl+strings.Replace(statement.Value.Content.(string), " ", SpaceReplacement, -1))
-			default:
+				properties[statementKey] = append(properties[statementKey], &shared.Data{
+					Simple: CommonsUrl + strings.Replace(statement.Value.Content.(string), " ", SpaceReplacement, -1),
+				})
+			case "globecoordinate":
+				if _, ok := properties[statementKey]; !ok {
+					properties[statementKey] = []*shared.Data{}
+				}
+				properties[statementKey] = append(properties[statementKey], &shared.Data{
+					GlobeCoordinate: statement.Value.GetGlobeCoordinate(),
+				})
+
+			case "monolingualtext":
+				if _, ok := properties[statementKey]; !ok {
+					properties[statementKey] = []*shared.Data{}
+				}
+				properties[statementKey] = append(properties[statementKey], &shared.Data{
+					MonoLingualText: statement.Value.GetMonolingualText(),
+				})
+
+			case "quantity":
 				if _, ok := properties[statementKey]; !ok {
-					properties[statementKey] = []any{}
+					properties[statementKey] = []*shared.Data{}
 				}
-				properties[statementKey] = append(properties[statementKey], statement.Value.Content)
+				properties[statementKey] = append(properties[statementKey], &shared.Data{
+					Quantity: statement.Value.GetQuantity(),
+				})
+			case "time":
+				if _, ok := properties[statementKey]; !ok {
+					properties[statementKey] = []*shared.Data{}
+				}
+				properties[statementKey] = append(properties[statementKey], &shared.Data{
+					Time: statement.Value.GetTime(),
+				})
+			case "wikibase-item", "external-id", "url", "string", "musicalnote", "mathematical-expression":
+				if _, ok := properties[statementKey]; !ok {
+					properties[statementKey] = []*shared.Data{}
+				}
+				properties[statementKey] = append(properties[statementKey], &shared.Data{
+					Simple: statement.Value.Content.(string),
+				})
+
+			default:
+				w.logger.Error().Msgf("Found type that is not handled: %s", statement.Property.DataType)
 			}
 		}
 	}
diff --git a/pkg/rest/wikidata_item.go b/pkg/rest/wikidata_item.go
index cf7dc03..13e65a3 100644
--- a/pkg/rest/wikidata_item.go
+++ b/pkg/rest/wikidata_item.go
@@ -1,5 +1,7 @@
 package rest
 
+import "gitlab.switch.ch/ub-unibas/wikidata-service/pkg/shared"
+
 type WikidataItem struct {
 	ID   string `json:"id" example:"Q42"`
 	Type string `json:"type" example:"item"`
@@ -24,3 +26,107 @@ type StatementValue struct {
 	Type    string      `json:"type" example:"value"`
 	Content interface{} `json:"content"`
 }
+
+// GetTime maps the content of a time value onto shared.Time. Fields that are
+// missing or have an unexpected type keep their zero value.
+func (v StatementValue) GetTime() *shared.Time {
+	content, ok := v.Content.(map[string]interface{})
+	if !ok {
+		return &shared.Time{}
+	}
+
+	timeStr, timeOK := content["time"].(string)
+	timeZone, timeZoneOK := content["timezone"].(string)
+	before, beforeOK := content["before"].(string)
+	after, afterOK := content["after"].(string)
+	precision, precisionOK := content["precision"].(float64)
+	calendarModel, calendarModelOK := content["calendarmodel"].(string)
+
+	return &shared.Time{
+		Time:          checkOk(timeStr, timeOK),
+		TimeZone:      checkOk(timeZone, timeZoneOK),
+		Before:        checkOk(before, beforeOK),
+		After:         checkOk(after, afterOK),
+		Precision:     checkOkPrecision(precision, precisionOK),
+		CalendarModel: checkOk(calendarModel, calendarModelOK),
+	}
+}
+
+// checkOk returns value when ok is true and the empty string otherwise.
+func checkOk(value string, ok bool) string {
+	if ok {
+		return value
+	}
+	return ""
+}
+
+// checkOkPrecision returns value when ok is true and 0 otherwise.
+func checkOkPrecision(value float64, ok bool) float64 {
+	if ok {
+		return value
+	}
+	return 0
+}
+
+// GetQuantity maps the content of a quantity value onto shared.Quantity.
+// Fields that are missing or are not strings are left empty.
+func (v StatementValue) GetQuantity() *shared.Quantity {
+	content, ok := v.Content.(map[string]interface{})
+	if !ok {
+		return &shared.Quantity{}
+	}
+
+	// A failed assertion already yields "", so no explicit fallback is needed.
+	amount, _ := content["amount"].(string)
+	unit, _ := content["unit"].(string)
+	upperbound, _ := content["upperbound"].(string)
+	lowerbound, _ := content["lowerbound"].(string)
+
+	return &shared.Quantity{
+		Amount:     amount,
+		Unit:       unit,
+		Upperbound: upperbound,
+		Lowerbound: lowerbound,
+	}
+}
+
+// GetMonolingualText maps the content of a monolingual text value onto
+// shared.MonoLingualText. Content that is not a JSON object, or fields that are
+// not strings, yield empty values instead of a panic.
+func (v StatementValue) GetMonolingualText() *shared.MonoLingualText {
+	content, ok := v.Content.(map[string]interface{})
+	if !ok {
+		return &shared.MonoLingualText{}
+	}
+
+	text, _ := content["text"].(string)
+	language, _ := content["language"].(string)
+
+	return &shared.MonoLingualText{
+		Text:     text,
+		Language: language,
+	}
+}
+
+// GetGlobeCoordinate maps the content of a globe coordinate value onto
+// shared.GlobeCoordinate. Content that is not a JSON object yields an empty
+// result instead of a panic.
+// NOTE(review): latitude/longitude are only kept when they are JSON strings; confirm.
+func (v StatementValue) GetGlobeCoordinate() *shared.GlobeCoordinate {
+	content, ok := v.Content.(map[string]interface{})
+	if !ok {
+		return &shared.GlobeCoordinate{}
+	}
+
+	lat, latOk := content["latitude"].(string)
+	lon, lonOk := content["longitude"].(string)
+	globe, globeOk := content["globe"].(string)
+	prec, precOk := content["precision"].(float64)
+
+	return &shared.GlobeCoordinate{
+		Latitude:  checkOk(lat, latOk),
+		Longitude: checkOk(lon, lonOk),
+		Globe:     checkOk(globe, globeOk),
+		Precision: checkOkPrecision(prec, precOk),
+	}
+}
diff --git a/pkg/service/cache.go b/pkg/service/cache.go
index 0b327c4..c8782f8 100644
--- a/pkg/service/cache.go
+++ b/pkg/service/cache.go
@@ -43,6 +43,9 @@ func (cache *WikidataCache) Get(key string) (*shared.Resource, error) {
 		}
 		return nil
 	})
+	if err != nil {
+		return nil, err
+	}
 	var resource *shared.Resource
 	unmarshalError := json.Unmarshal(extractedValue, resource)
 	if unmarshalError != nil {
diff --git a/pkg/shared/resource.go b/pkg/shared/resource.go
index 851f2d5..6ceceac 100644
--- a/pkg/shared/resource.go
+++ b/pkg/shared/resource.go
@@ -9,11 +9,53 @@ import (
 
 // Resource is a generic representation of a Wikidata resource.
 type Resource struct {
-	ID           string            `json:"id"`
-	Labels       map[string]string `json:"labels"`
-	Descriptions map[string]string `json:"descriptions"`
-	Data         map[string][]any  `json:"data"`
-	LastModified string            `json:"lastModified"`
+	ID           string             `json:"id"`
+	Labels       map[string]string  `json:"labels,omitempty"`
+	Descriptions map[string]string  `json:"descriptions,omitempty"`
+	Data         map[string][]*Data `json:"data,omitempty"`
+	LastModified string             `json:"lastModified,omitempty"`
+}
+
+// Data holds a single statement value. extractProperties in pkg/rest sets
+// exactly one of the fields, depending on the data type of the property.
+type Data struct {
+	Simple          string           `json:"simple,omitempty"`
+	Time            *Time            `json:"time,omitempty"`
+	Quantity        *Quantity        `json:"quantity,omitempty"`
+	GlobeCoordinate *GlobeCoordinate `json:"globeCoordinate,omitempty"`
+	MonoLingualText *MonoLingualText `json:"monoLingualText,omitempty"`
+}
+
+// GlobeCoordinate is the representation of a globe coordinate statement value.
+type GlobeCoordinate struct {
+	Latitude  string  `json:"latitude,omitempty"`
+	Longitude string  `json:"longitude,omitempty"`
+	Globe     string  `json:"globe,omitempty"`
+	Precision float64 `json:"precision,omitempty"`
+}
+
+// Time is the representation of a time statement value.
+type Time struct {
+	Time          string  `json:"time,omitempty"`
+	TimeZone      string  `json:"timezone,omitempty"`
+	Before        string  `json:"before,omitempty"`
+	After         string  `json:"after,omitempty"`
+	Precision     float64 `json:"precision,omitempty"`
+	CalendarModel string  `json:"calendarmodel,omitempty"`
+}
+
+// Quantity is the representation of a quantity statement value.
+type Quantity struct {
+	Amount     string `json:"amount,omitempty"`
+	Unit       string `json:"unit,omitempty"`
+	Upperbound string `json:"upperbound,omitempty"`
+	Lowerbound string `json:"lowerbound,omitempty"`
+}
+
+// MonoLingualText is a text with its language; Text is serialized as "value".
+type MonoLingualText struct {
+	Text     string `json:"value,omitempty"`
+	Language string `json:"language,omitempty"`
 }
 
 // ToReader converts the Resource to an io.Reader.
diff --git a/tests/generate_list.go b/tests/generate_list.go
new file mode 100644
index 0000000..281c6e7
--- /dev/null
+++ b/tests/generate_list.go
@@ -0,0 +1,73 @@
+package main
+
+import (
+	"encoding/json"
+	"fmt"
+	"io"
+	"net/http"
+	"os"
+	"time"
+)
+
+// httpClient bounds every lookup so one stalled request cannot block the run.
+var httpClient = &http.Client{Timeout: 30 * time.Second}
+
+// main probes the identifiers Q0 to Q100000 and writes one service URL per
+// existing item to tests/wikidata_requests.txt.
+func main() {
+	// Create or open a file to write the requests
+	file, err := os.Create("tests/wikidata_requests.txt")
+	if err != nil {
+		fmt.Printf("Error creating file: %v\n", err)
+		return
+	}
+	defer file.Close()
+
+	for i := 0; i <= 100000; i++ {
+		identifier := fmt.Sprintf("Q%d", i)
+
+		if exists, err := checkWikidataIdentifier(identifier); err != nil {
+			fmt.Printf("Error checking identifier %s: %v\n", identifier, err)
+		} else if exists {
+			fmt.Printf("Identifier %s exists on Wikidata.\n", identifier)
+			_, err := file.WriteString(fmt.Sprintf("https://wikidata.ub-dd-prod.k8s.unibas.ch/api/v1/wikidata/%s\n", identifier))
+			if err != nil {
+				fmt.Printf("Error writing to file: %v\n", err)
+				return
+			}
+		} else {
+			fmt.Printf("Identifier %s does not exist on Wikidata.\n", identifier)
+		}
+	}
+}
+
+// checkWikidataIdentifier reports whether identifier resolves to an item in the
+// Wikidata REST API. A 404 response means "does not exist" and is not an error;
+// any other non-200 status is returned as an error.
+func checkWikidataIdentifier(identifier string) (bool, error) {
+	apiEndpoint := "https://wikidata.org/w/rest.php/wikibase/v0/entities/items/" + identifier
+	resp, err := httpClient.Get(apiEndpoint)
+	if err != nil {
+		return false, err
+	}
+	defer resp.Body.Close()
+
+	switch resp.StatusCode {
+	case http.StatusOK:
+		var response map[string]interface{}
+		if err := decodeJSON(resp.Body, &response); err != nil {
+			return false, err
+		}
+		return true, nil
+	case http.StatusNotFound:
+		return false, nil
+	default:
+		return false, fmt.Errorf("wikidata API returned status: %s", resp.Status)
+	}
+}
+
+// decodeJSON decodes a single JSON value from reader into target.
+func decodeJSON(reader io.Reader, target interface{}) error {
+	decoder := json.NewDecoder(reader)
+	return decoder.Decode(target)
+}
-- 
GitLab