Commit f8348a0e authored by Juergen Enge's avatar Juergen Enge

anpassungen

parent 23739ec8
Pipeline #17158 failed with stages
in 41 seconds
package main
import (
"context"
"database/sql"
"flag"
"fmt"
_ "github.com/go-sql-driver/mysql"
"github.com/op/go-logging"
"github.com/robfig/cron"
"gitlab.switch.ch/memoriav/memobase-2020/services/streaming-server/pkg/memostream"
"gitlab.switch.ch/memoriav/memobase-2020/services/url-checker/pkg/memocrawler"
"log"
"os"
"os/signal"
"path/filepath"
"strings"
"syscall"
"time"
)
......@@ -26,13 +22,14 @@ clear database: update test.`entities` set status="new", errormessage=null,mimet
// cronLogger wraps a go-logging Logger. Its Info and Error methods forward
// a message plus alternating key/value arguments to that logger.
type cronLogger struct {
// log is the logger that receives all messages.
log *logging.Logger
}
// Info logs routine messages about cron's operation.
func (cl cronLogger) Info(msg string, keysAndValues ...interface{}) {
str := msg
var name string
for _, val := range keysAndValues {
if name == "" {
name,_ = val.(string)
name, _ = val.(string)
} else {
str += fmt.Sprintf("\n %v: %v", name, val)
name = ""
......@@ -40,15 +37,13 @@ func (cl cronLogger) Info(msg string, keysAndValues ...interface{}) {
}
cl.log.Info(str)
}
// Error logs an error condition.
//
// The logged text is "<msg>: <err>", followed by one indented "key: value"
// line for every pair in keysAndValues. keysAndValues is expected to hold
// alternating keys and values; a trailing element without a partner is
// printed on its own line rather than dropped.
func (cl cronLogger) Error(err error, msg string, keysAndValues ...interface{}) {
	var details strings.Builder
	for i := 0; i < len(keysAndValues); i += 2 {
		if i+1 < len(keysAndValues) {
			fmt.Fprintf(&details, "\n   %v: %v", keysAndValues[i], keysAndValues[i+1])
		} else {
			// Odd trailing element: print it alone instead of losing it.
			fmt.Fprintf(&details, "\n   %v", keysAndValues[i])
		}
	}
	// A fixed format string keeps err bound to its own verb no matter how
	// many key/value arguments are passed, and prevents a '%' inside msg
	// from being interpreted as a formatting verb.
	cl.log.Errorf("%s: %v%s", msg, err, details.String())
}
func main() {
configFile := flag.String("cfg", "./memocrawler.toml", "config file location")
flag.Parse()
......@@ -121,46 +116,6 @@ func main() {
cl := memocrawler.NewCrawlerLinkcheck(cr, config.Crawler.Timeout.Duration, config.Crawler.HeaderSize)
cr.SetCrawlerLinkcheck(cl)
if config.Crawler.Cron == "" {
cr.Start()
return
}
//c := cron.New(cron.WithLogger(cronLogger{log:log}))
c := cron.New()
c.AddFunc(config.Crawler.Cron, func() {
if err := cr.Start(); err != nil {
log.Errorf( "crawl error: %v", err)
}
})
c.Start()
end := make(chan bool, 1)
// process waiting for interrupt signal (TERM or KILL)
go func() {
sigint := make(chan os.Signal, 1)
// interrupt signal sent from terminal
signal.Notify(sigint, os.Interrupt)
signal.Notify(sigint, syscall.SIGTERM)
signal.Notify(sigint, syscall.SIGKILL)
<-sigint
// We received an interrupt signal, shut down.
log.Infof("shutdown requested")
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
c.Stop()
cr.Shutdown(ctx)
end <- true
}()
<-end
log.Info("server stopped")
cr.Start()
}
......@@ -4,13 +4,8 @@ go 1.14
require (
github.com/BurntSushi/toml v0.3.1
github.com/bluele/gcache v0.0.0-20190518031135-bc40bd653833 // indirect
github.com/dgrijalva/jwt-go v3.2.0+incompatible // indirect
github.com/go-sql-driver/mysql v1.5.0
github.com/goph/emperror v0.17.2
github.com/gorilla/handlers v1.4.2 // indirect
github.com/gorilla/mux v1.7.4 // indirect
github.com/op/go-logging v0.0.0-20160315200505-970db520ece7
github.com/robfig/cron v1.2.0
gitlab.switch.ch/memoriav/memobase-2020/services/streaming-server v0.0.0-20200429072912-1967cd685b73
gitlab.switch.ch/memoriav/memobase-2020/services/streaming-server v0.0.0-20201101123821-07b733154ba4
)
......@@ -11,6 +11,8 @@ github.com/certifi/gocertifi v0.0.0-20190105021004-abcd57078448/go.mod h1:GJKEex
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/dgrijalva/jwt-go v3.2.0+incompatible h1:7qlOGliEKZXTDg6OTjfoBKDXWrumCAMpl/TFQ4/5kLM=
github.com/dgrijalva/jwt-go v3.2.0+incompatible/go.mod h1:E3ru+11k8xSBh+hMPgOLZmtrrCbhqsmaPHjLKYnJCaQ=
github.com/felixge/httpsnoop v1.0.1 h1:lvB5Jl89CsZtGIWuTcDM1E/vkVs49/Ml7JJe07l8SPQ=
github.com/felixge/httpsnoop v1.0.1/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U=
github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo=
github.com/getsentry/raven-go v0.2.0/go.mod h1:KungGk8q33+aIAZUIVWZDr2OfAEBsO49PX4NzFV5kcQ=
github.com/go-sql-driver/mysql v1.5.0 h1:ozyZYNQW3x3HtqT1jira07DN2PArx2v7/mN66gGcHOs=
......@@ -19,10 +21,10 @@ github.com/gofrs/uuid v3.2.0+incompatible/go.mod h1:b2aQJv3Z4Fp6yNu3cdSllBxTCLRx
github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/goph/emperror v0.17.2 h1:yLapQcmEsO0ipe9p5TaN22djm3OFV/TfM/fcYP0/J18=
github.com/goph/emperror v0.17.2/go.mod h1:+ZbQ+fUNO/6FNiUo0ujtMjhgad9Xa6fQL9KhH4LNHic=
github.com/gorilla/handlers v1.4.2 h1:0QniY0USkHQ1RGCLfKxeNHK9bkDHGRYGNDFBCS+YARg=
github.com/gorilla/handlers v1.4.2/go.mod h1:Qkdc/uu4tH4g6mTK6auzZ766c4CA0Ng8+o/OAirnOIQ=
github.com/gorilla/mux v1.7.4 h1:VuZ8uybHlWmqV03+zRzdwKL4tUnIp1MAQtp1mIFE1bc=
github.com/gorilla/mux v1.7.4/go.mod h1:DVbg23sWSpFRCP0SfiEN6jmj59UnW/n46BH5rLB71So=
github.com/gorilla/handlers v1.5.1 h1:9lRY6j8DEeeBT10CvO9hGW0gmky0BprnvDI5vfhUHH4=
github.com/gorilla/handlers v1.5.1/go.mod h1:t8XrUpc4KVXb7HGyJ4/cEnwQiaxrX/hz1Zv/4g96P1Q=
github.com/gorilla/mux v1.8.0 h1:i40aqfkR1h2SlN9hojwV5ZA91wcXFOvkdNIeFDP5koI=
github.com/gorilla/mux v1.8.0/go.mod h1:DVbg23sWSpFRCP0SfiEN6jmj59UnW/n46BH5rLB71So=
github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU=
github.com/kardianos/osext v0.0.0-20190222173326-2bc1f35cddc0/go.mod h1:1NbS8ALrpOvjt0rHPNLyCIeMtbizbir8U//inJ+zuB8=
github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
......@@ -37,16 +39,12 @@ github.com/op/go-logging v0.0.0-20160315200505-970db520ece7/go.mod h1:HzydrMdWEr
github.com/pkg/errors v0.8.0 h1:WdK/asTD0HN+q6hsWO3/vpuAkAr+tw6aNJNDFFf0+qw=
github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/robfig/cron v1.2.0 h1:ZjScXvvxeQ63Dbyxy76Fj3AT3Ut0aKsyd2/tl3DTMuQ=
github.com/robfig/cron v1.2.0/go.mod h1:JGuDeoQd7Z6yL4zQhZ3OPEVHB7fL6Ka6skscFHfmt2k=
github.com/rollbar/rollbar-go v1.0.2/go.mod h1:AcFs5f0I+c71bpHlXNNDbOWJiKwjFDtISeXco0L5PKQ=
github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo=
github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
gitlab.switch.ch/memoriav/memobase-2020/services/streaming-server v0.0.0-20200428134555-1b2f17c5d83b h1:unWTj5yUDM1n7h7ri/O1p3EM//O1lFtbE3rHzBxGbMI=
gitlab.switch.ch/memoriav/memobase-2020/services/streaming-server v0.0.0-20200428134555-1b2f17c5d83b/go.mod h1:U/TH5AF3w1u1RhRUMym/wzuILayfStGsqPunzCpv7mw=
gitlab.switch.ch/memoriav/memobase-2020/services/streaming-server v0.0.0-20200429072912-1967cd685b73 h1:gRZDEXWyeOEM/J+gyjtxqWGkDBg5Ort7ml91wTC1eO0=
gitlab.switch.ch/memoriav/memobase-2020/services/streaming-server v0.0.0-20200429072912-1967cd685b73/go.mod h1:LCD6NUeRRlC1M0oOvmrQ6FeDQrrHbJY2tzxEsS0VjWg=
gitlab.switch.ch/memoriav/memobase-2020/services/streaming-server v0.0.0-20201101123821-07b733154ba4 h1:VT0POAj1yFhvNYJk8V4s5AgmTRMEK2dw1gIoeOsvDVI=
gitlab.switch.ch/memoriav/memobase-2020/services/streaming-server v0.0.0-20201101123821-07b733154ba4/go.mod h1:Y1m6dem32X0PPG3KuBw/5vaNQN4iOsrgGwPLmRoP+Z8=
golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
......
......@@ -11,7 +11,6 @@
package memocrawler
import (
"context"
"database/sql"
"fmt"
"github.com/goph/emperror"
......@@ -60,28 +59,24 @@ func NewCrawler(
mapping *memostream.FileMapper,
log *logging.Logger) *Crawler {
cr := &Crawler{
db: db,
workers: workers,
pageSize: pageSize,
schema: schema,
tempDir: tempDir,
indexer: indexer,
crawlOK: crawlOK,
crawlError: crawlError,
crawlErrorNew: crawlErrorNew,
metaTimeout: metaTimeout,
metaWorkers: metaWorkers,
metaPageSize: metaPageSize,
mapping: mapping,
log: log,
db: db,
workers: workers,
pageSize: pageSize,
schema: schema,
tempDir: tempDir,
indexer: indexer,
crawlOK: crawlOK,
crawlError: crawlError,
crawlErrorNew: crawlErrorNew,
metaTimeout: metaTimeout,
metaWorkers: metaWorkers,
metaPageSize: metaPageSize,
mapping: mapping,
log: log,
}
return cr
}
// Metadata is the JSON container for format matches; the matches are
// serialized under the key "siegfried".
type Metadata struct {
	// SFMatches holds the list of format matches.
	SFMatches []SFMatches `json:"siegfried"`
}
// SetCrawlerLinkcheck stores cl in the crawler's cl field, replacing any
// previously assigned value.
func (cr *Crawler) SetCrawlerLinkcheck(cl *CrawlerLinkcheck) {
cr.cl = cl
}
......@@ -118,41 +113,6 @@ func (cr *Crawler) getEntries(sqlstr string, args ...interface{}) ([]*memostream
return entries, nil
}
// MetaNew queues metadata jobs for entities that have status "ok" but no
// metadata row yet (m.modificationtime IS NULL after the LEFT JOIN).
//
// It repeatedly fetches a batch through getEntries, pushes one
// JobType_Metadata job per entry onto metaJobQueue, and then polls once per
// second until the queue reports idle before fetching the next batch. The
// method returns nil as soon as a fetch yields no entries, or a wrapped
// error if a fetch fails.
func (cr *Crawler) MetaNew() error {
	cr.log.Infof("start crawling metadata for new entities")

	query := fmt.Sprintf("SELECT e.sig AS signature, e.uri, e.access, e.proto AS protocol, e.`status` "+
		"FROM %s.entities e LEFT JOIN %s.metadata m ON e.sig=m.sig "+
		"WHERE e.status=? AND m.modificationtime IS NULL "+
		"ORDER BY e.creationtime ASC", cr.schema, cr.schema)

	for {
		batch, err := cr.getEntries(query, "ok")
		if err != nil {
			return emperror.Wrapf(err, "cannot get new entries")
		}
		if len(batch) == 0 {
			// Nothing left to process.
			return nil
		}

		for i := range batch {
			job := &Job{
				ID:    batch[i].Signature,
				Type:  JobType_Metadata,
				cr:    cr,
				entry: batch[i],
			}
			cr.metaJobQueue.AddBack(job)
		}

		// Wait until the last worker is done before asking for more entries.
		for !cr.metaJobQueue.isIdle() {
			time.Sleep(1 * time.Second)
		}
	}
}
func (cr *Crawler) CrawlNew() error {
cr.log.Infof("start crawling new entities")
......@@ -292,7 +252,3 @@ func (cr *Crawler) Start() error {
}
return nil
}
// Shutdown currently does nothing: the body is empty and ctx is not used.
func (cr *Crawler) Shutdown(ctx context.Context) {
}
// This file is part of Memobase Crawler which is released under GPLv3.
// See file license.txt for full license details.
//
// Author Juergen Enge <juergen@info-age.net>
//
// This code uses elements from
// * "Mediaserver" (Center for Digital Matter HGK FHNW, Basel)
// * "Remote Exhibition Project" (info-age GmbH, Basel)
//
package memocrawler
import (
	"encoding/json"
	"fmt"
	"io/ioutil"
	"net/http"
	"net/url"
	"strings"

	"github.com/goph/emperror"
)
// SFIdentifier is one element of the SF.Identifiers list in a decoded
// Siegfried JSON response.
type SFIdentifier struct {
Name string `json:"name,omitempty"`
Details string `json:"details,omitempty"`
}
// SFMatches is one format match reported for a file (see SFFiles.Matches).
// All fields are decoded as plain strings from the JSON keys named in the tags.
type SFMatches struct {
Ns string `json:"ns,omitempty"`
Id string `json:"id,omitempty"`
Format string `json:"format,omitempty"`
Version string `json:"version,omitempty"`
Mime string `json:"mime,omitempty"`
Basis string `json:"basis,omitempty"`
Warning string `json:"warning,omitempty"`
}
// SFFiles is one element of the SF.Files list: a single file entry together
// with the format matches found for it.
type SFFiles struct {
Filename string `json:"filename,omitempty"`
Filesize int64 `json:"filesize,omitempty"`
Modified string `json:"modified,omitempty"`
Errors string `json:"errors,omitempty"`
// Matches is what Siegfried.Get returns for the first file of a response.
Matches []SFMatches `json:"matches,omitempty"`
}
// SF is the top-level document of a Siegfried JSON response as decoded by
// Siegfried.Get.
type SF struct {
	Siegfried string `json:"siegfried,omitempty"`
	Scandate  string `json:"scandate,omitempty"`
	Signature string `json:"signature,omitempty"`
	Created   string `json:"created,omitempty"`
	// Identifiers is decoded from the "identifiers" key of the response.
	Identifiers []SFIdentifier `json:"identifiers,omitempty"`
	// Files lists the identified files; Siegfried.Get uses the first entry.
	Files []SFFiles `json:"files,omitempty"`
}
// Siegfried is a client for an HTTP identification service.
type Siegfried struct {
// surl is the request URL template; Get replaces every "[[PATH]]"
// placeholder in it with the query-escaped filename.
surl string
}
// Get asks the Siegfried service to identify filename and returns the format
// matches reported for the first file in the response.
//
// The request URL is built from sf.surl by replacing every "[[PATH]]"
// placeholder with the query-escaped filename. A non-nil error is returned
// when the request fails, the service answers with a status other than
// 200 OK, the body cannot be read or decoded as JSON, or the decoded
// response contains no file entry.
func (sf *Siegfried) Get(filename string) ([]SFMatches, error) {
	urlstring := strings.Replace(sf.surl, "[[PATH]]", url.QueryEscape(filename), -1)
	resp, err := http.Get(urlstring)
	if err != nil {
		return nil, emperror.Wrapf(err, "cannot query siegfried - %v", urlstring)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		// err is nil on this path, so there is nothing to wrap; create a
		// fresh error so the caller never receives (nil, nil).
		return nil, fmt.Errorf("status not ok - %v -> %v", urlstring, resp.Status)
	}

	bodyBytes, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return nil, emperror.Wrapf(err, "error reading body - %v", urlstring)
	}

	result := SF{}
	if err := json.Unmarshal(bodyBytes, &result); err != nil {
		return nil, emperror.Wrapf(err, "error decoding json - %v", string(bodyBytes))
	}
	if len(result.Files) == 0 {
		// As above: no underlying error exists here, so build a new one.
		return nil, fmt.Errorf("no file in sf result - %v", string(bodyBytes))
	}
	return result.Files[0].Matches, nil
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment