Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
memoriav
Memobase 2020
utilities
Import Process CLI
Commits
0247f5f7
Commit
0247f5f7
authored
Oct 04, 2021
by
Jonas Waeber
Browse files
Remove old scripts
parent
208ba180
Changes
56
Expand all
Hide whitespace changes
Inline
Side-by-side
utilities/kafka/clear_topics.py
deleted
100644 → 0
View file @
208ba180
from
kafka.admin
import
KafkaAdminClient
if __name__ == '__main__':
    # Open an admin connection to the three Kafka brokers.
    client = KafkaAdminClient(
        bootstrap_servers="mb-ka1.memobase.unibas.ch:9092,mb-ka2.memobase.unibas.ch:9092,mb-ka3.memobase.unibas.ch:9092",
        client_id='admin-client-baz-mei',
    )
    # Pick every topic whose name starts with 'p1' ...
    doomed_topics = [
        topic_name
        for topic_name in client.list_topics()
        if topic_name.startswith('p1')
    ]
    # ... and remove them all with a single request.
    client.delete_topics(doomed_topics)
utilities/kafka/consume_topic.py
deleted
100644 → 0
View file @
208ba180
import
sys
from
kafka
import
KafkaConsumer
if __name__ == '__main__':
    # Usage: consume_topic.py <topic>
    #
    # Validate the command line before creating the consumer, so a missing
    # argument yields a usage hint instead of an IndexError traceback raised
    # after the connection was already set up.
    if len(sys.argv) < 2:
        raise SystemExit(f"usage: {sys.argv[0]} <topic>")
    topic = sys.argv[1]

    consumer = KafkaConsumer(
        bootstrap_servers="mb-ka1.memobase.unibas.ch:9092,"
                          "mb-ka2.memobase.unibas.ch:9092,"
                          "mb-ka3.memobase.unibas.ch:9092",
        client_id='utility-consumer',
        group_id='utility-consumer',
        auto_offset_reset='earliest')
    try:
        consumer.subscribe([topic])
        # Print every record the consumer yields.
        for record in consumer:
            print(record)
    finally:
        # Release the consumer's connections however the loop ends
        # (error or Ctrl-C); the exception itself still propagates.
        consumer.close()
utilities/kafka/create_topic.py
deleted
100644 → 0
View file @
208ba180
from
sys
import
argv
from
kafka.admin
import
KafkaAdminClient
,
NewTopic
from
kafka.errors
import
TopicAlreadyExistsError
if __name__ == '__main__':
    # Usage: create_topic.py <topic-name>
    #
    # Creates one topic with 3 partitions and a replication factor of 1.
    # Check the argument first so a missing name gives a usage hint rather
    # than an IndexError raised after the admin connection was opened.
    if len(argv) < 2:
        raise SystemExit(f"usage: {argv[0]} <topic-name>")

    admin_client = KafkaAdminClient(
        bootstrap_servers="mb-ka1.memobase.unibas.ch:9092,"
                          "mb-ka2.memobase.unibas.ch:9092,"
                          "mb-ka3.memobase.unibas.ch:9092",
        client_id='admin-client-baz-mei')
    new_topic = NewTopic(name=argv[1],
                         num_partitions=3,
                         replication_factor=1)
    try:
        # Only the call that can raise TopicAlreadyExistsError lives in the
        # try body; the success message moves to the else branch.
        admin_client.create_topics(new_topics=[new_topic],
                                   validate_only=False)
    except TopicAlreadyExistsError:
        print("Topics already exists!")
    else:
        print(f"Created the following "
              f"topics: {new_topic.name} (p={new_topic.num_partitions}, rf={new_topic.replication_factor}).")
    finally:
        # Always release the admin client's connections.
        admin_client.close()
utilities/kafka/list_topics.py
deleted
100644 → 0
View file @
208ba180
from
kafka.admin
import
KafkaAdminClient
if __name__ == '__main__':
    # Open an admin connection to the three Kafka brokers.
    client = KafkaAdminClient(
        client_id='admin-client-baz-mei',
        bootstrap_servers="mb-ka1.memobase.unibas.ch:9092,mb-ka2.memobase.unibas.ch:9092,mb-ka3.memobase.unibas.ch:9092",
    )
    # Fetch the topic names and print them in sorted order, one per line.
    topic_names = list(client.list_topics())
    topic_names.sort()
    for topic_name in topic_names:
        print(topic_name)
utilities/languages/custom_labels.csv
deleted
100644 → 0
View file @
208ba180
"Andere","Andere","Autres","Altri"
"Geräusche","Geräusch","Bruit","Rumore"
"Musik","Musik","Musique","Musica"
"Stumm","Stumm","Muet","Muto"
"Dialekt","Dialekt","dialecte","dialetto"
"Patois romand","Westschweizer Dialekt","patois suisse romand","patois svizzero"
"Afrikanische Sprache","Afrikanische Sprache","langue d'Afrique","lingua d'Africa"
\ No newline at end of file
utilities/languages/language_labels.csv
deleted
100644 → 0
View file @
208ba180
This diff is collapsed.
Click to expand it.
utilities/languages/missing_labels.csv
deleted
100644 → 0
View file @
208ba180
"Q1398263","Liechtensteinische Mundarten","","dialetto liechtensteinese"
"Q1435171","Solothurner Dialekt","",""
"Q4068780","Aargauerisch","","dialetto tedesco dell'Argovia"
"Q13601748","Penan","penan",""
"Q20012755","St.-Galler-Deutsch","","dialetto tedesco di San Gallo"
"Q65409718","","",""
"Q98969266","","",""
"Q98969282","Nidwaldner Dialekt","",""
"Q98969307","Schaffhauser Dialekt","",""
"Q98969316","","",""
"Q98969320","Thurgauer Dialekt","",""
"Q98969581","","",""
"Q98969595","Glarner Dialekt","",""
"Q98969600","","",""
"Q98969866","Zuger Dialekt","",""
utilities/languages/query.sparql
deleted
100644 → 0
View file @
208ba180
# Select the rdfs:label values of one Wikidata entity, restricted to the
# German ("de"), French ("fr") and Italian ("it") labels.
# PLACEHOLDER is a template token that must be replaced with the entity
# identifier before the query is sent.
SELECT ?item
WHERE
{
wd:PLACEHOLDER rdfs:label ?item .
FILTER(lang(?item) = "de" || lang(?item) = "fr" || lang(?item) = "it" )
}
\ No newline at end of file
utilities/languages/script.py
deleted
100644 → 0
View file @
208ba180
import
csv
import
logging
import
sys
from
SPARQLWrapper
import
SPARQLWrapper
,
JSON
# Route log records of level INFO and above to standard output.
logging.basicConfig(level=logging.INFO, stream=sys.stdout)
def read_csv_file(path: str):
    """Collect the identifiers and custom strings of a language mapping file.

    The first row is treated as a header and skipped. In every following row
    the first column is ignored; non-empty cells in the next six columns are
    collected as Wikidata identifiers, and non-empty cells in any later column
    are collected as custom strings.

    :param path: path of the CSV file to read (UTF-8, 'unix' csv dialect).
    :return: a tuple ``(ids, strings)`` holding two sets of strings.
    """
    ids = set()
    strings = set()
    logging.info("Reading language mapping file.")
    # The encoding is pinned because the mapped values may be non-ASCII and
    # the platform default is not guaranteed to be UTF-8; newline='' is what
    # the csv module asks for so that it can handle line endings itself.
    with open(path, 'r', encoding='utf-8', newline='') as fp:
        csv_rows = csv.reader(fp, dialect='unix')
        # skip the header
        next(csv_rows, None)
        for r in csv_rows:
            for index, item in enumerate(r[1:]):
                if item == "":
                    continue
                if index <= 5:
                    ids.add(item)
                else:
                    strings.add(item)
    logging.info("Collected all mapped wikidata identifiers and custom strings.")
    logging.info(f"There are {len(ids)} unique wikidata identifiers present.")
    # Sorted so that the log line is stable between runs.
    logging.info(f"The following custom facet values are present: {', '.join(sorted(strings))}")
    return ids, strings
if __name__ == '__main__':
    # Builds 'language_labels.csv' (id, de, fr, it) by querying Wikidata for
    # every identifier found in the language mapping file, appends the custom
    # labels, and records identifiers lacking a label in 'missing_labels.csv'.
    source_path = '../../global-configs/prod/transforms/languages.csv'
    wikidata_identifiers, custom_strings = read_csv_file(source_path)

    logging.info("Check if all custom facet values are mapped to a label.")
    # The label files contain non-ASCII text, so the encoding is explicit.
    with open('custom_labels.csv', 'r', encoding='utf-8') as cl:
        custom_label_text = cl.read()
    custom_labels = csv.reader(custom_label_text.split('\n'), dialect='unix')
    # A blank line (e.g. from a trailing newline) parses to an empty row;
    # skip it instead of failing with IndexError on row[0].
    defined_labels = {row[0] for row in custom_labels if row}
    difference = custom_strings.difference(defined_labels)
    if difference:
        logging.error(f"The following custom facet values have no labels: {', '.join(sorted(difference))}.")
    else:
        logging.info("All custom facet values have a label defined.")

    logging.info("Setting up connection to service.")
    s = SPARQLWrapper("https://query.wikidata.org/sparql",
                      agent='Python Script (University Library Basel, jonas.waeber@unibas.ch)')
    # The return format does not change between requests, so set it once.
    s.setReturnFormat(JSON)

    logging.info("Reading SPARQL template.")
    with open('query.sparql', 'r', encoding='utf-8') as sp:
        request_template = sp.read()

    logging.info("Writing the language labels file.")
    missing_labels = []
    # Order numerically by the digits after the 'Q' prefix.
    wikidata_identifiers = sorted(wikidata_identifiers,
                                  key=lambda x: int(x.replace('Q', '')))
    with open('language_labels.csv', 'w', encoding='utf-8', newline='') as w:
        writer = csv.writer(w, dialect='unix')
        writer.writerow(['id', 'de', 'fr', 'it'])
        for q in wikidata_identifiers:
            s.setQuery(request_template.replace('PLACEHOLDER', q))
            logging.info(f"Query Wikidata service for value {q}.")
            results = s.query().convert()
            # Map language code -> label for this identifier.
            lang_values = {
                row['item']['xml:lang']: row['item']['value']
                for row in results['results']['bindings']
            }
            de = lang_values.get('de', '')
            fr = lang_values.get('fr', '')
            it = lang_values.get('it', '')
            writer.writerow([q, de, fr, it])
            if de == '' or fr == '' or it == '':
                missing_labels.append([q, de, fr, it])
        # add the custom facet value labels at the end.
        w.write(custom_label_text)

    if missing_labels:
        logging.info("Writing missing labels.")
        missing_labels = sorted(missing_labels,
                                key=lambda x: int(x[0].replace('Q', '')))
        with open('missing_labels.csv', 'w', encoding='utf-8', newline='') as w:
            writer = csv.writer(w, dialect='unix')
            writer.writerows(missing_labels)
    logging.info("Finished processing format labels.")
utilities/mappings/institution_ids.csv
deleted
100644 → 0
View file @
208ba180
ID,ID_bisher
aag,StAAG
abe,StaBE
acj,ArCJ
adg,ADG
afz,AfZ
agl,LAGL
apf,PlansFixes
atd,STD
ati,ASTi
avl,AVL
azh,ZHdK_A
baa,USI_B
bab,BAB
bar,BAR
baz,BAZ
bbb,Burgerbib
bcf,BCUF
bfl,swisstopo
bmf,BMF
bpu,BPUN
bvc,BVCF
cde,Cde
cdt,CdT
cic,CIC
cic,ICRC
clg,CL
clu,C44
csa,CS
csa_1,
csa_2,
fad,ArchivioDonetta
fer,FER
fgr,FotoGR
fgr_1,
fpc,FPC
fss,FSS
gvs,GVS
hgk,FHNW_HGK
hsl,HSLU_DuK
ias,IASA
ikg,IKG
ikr,IFRC
kak,KanalK
kbg,KBGR
kek,KlosterEinsiedeln
khz,KH_ZH
klu,StLu
kmm,MuseeLaNeuveville
lfg,LJ
lkb,LS
lmz,SNM
maa,SMA
mav,Memoriav
mcl,MCA
mdl,MdL
meg,meg
mel,Elysee
mem,CentroElisarion
mfk,mfk
mgb,Museegruerien
mgz,ZHdK
mgz_1,
mhl,MHL
mov,Movendo
mws,Mediatheque
raf,Radio_Stadtfilter
rkk,KuK
rra,LoRa
rti,Radiotelevisionesvizzera
rtr,RTR
rts,RTS
rts_1,
rxb,RadioX
sap,Tanzarchiv
sap_1,
sap_2,
sbb,SBB
sik,SIK
snb,NB
snp,Fonoteca
son,sonohr
soz,SozArch
srf,SRF
srf_1,
srf_2,
sts,StSh
swi,Swissinfo
ubb,UBB
vks,Verkehrshaus
zbz,ZB
zem,ZEM
utilities/mappings/record_sets_ids.csv
deleted
100644 → 0
View file @
208ba180
identifier_new,identifier_old,institution_id
csa-001,CS-CS_CF,csa
cdt-001,CdT-SON,cdt
ati-002,ASTi-FPC,ati
rti-001,Radiotelevisionesvizzera-Documentario,rti
srf-001,SRF-bv8,srf
srf-002,SRF-BPBV8,srf
srf-003,SRF-CH-M,srf
srf-004,SRF-Karussell,srf
srf-005,SRF-Kassensturz,srf
srf-006,SRF-LSR,srf
srf-007,SRF-MTW,srf
srf-008,SRF-Netto,srf
srf-009,SRF-PDW,srf
srf-010,SRF-RJ,srf
srf-011,SRF-RS,srf
srf-012,SRF-TS,srf
mfk-001,mfk-FLM,mfk
soz-005,SozArch-Sozarch_F_9045,soz
afz-001,AfZ-Lutz,afz
maa-001,SMA-IBA,maa
zem-001,ZEM-F,zem
soz-001,SozArch-F_9005,soz
soz-002,SozArch-F_9004,soz
lfg-001,LJ-Filmbestand_Langjahr_GmbH,lfg
kbg-001,KBGR-AV,kbg
maa-002,SMA-SK,maa
agl-001,LAGL-PA_111_Sch_Y,agl
bar-001,SFW_CJS_CGS-SFW_CJS_CGS,bar
bar-002,BAR-SABZ,bar
soz-003,SozArch-F_9003,soz
afz-002,AfZ-Bosshard,afz
mav-001,Memoriav-MB_alt_film,mav
clg-001,CL-Leuzinger,clg
lkb-001,LS-film,lkb
ati-001,ASTi-Monotti,ati
mem-001,CentroElisarion-vonKupffervonMayer,mem
fad-001,ArchivioDonetta-Donetta,fad
rti-002,Radiotelevisionesvizzera-DM,rti
cde-001,Cde-LEYDI,cde
fer-001,FER-RPN,fer
rti-003,Radiotelevisionesvizzera-Archiv,rti
cdt-002,CdT-gar,cdt
cic-001,ICRC-V-F-CR-H,cic
ikr-001,IFRC-FILM,ikr
mgb-001,Museegruerien-Morel,mgb
mhl-001,MHL-Constant,mhl
mws-001,Mediatheque-Schmid,mws
mgb-002,Museegruerien-Prangey,mgb
bcf-001,BCUF-Thevoz,bcf
mdl-001,MdL-ML,mdl
ias-001,IASA-Collart,ias
kmm-001,MuseeLaNeuveville-Hirt,kmm
mgb-003,Museegruerien-Glasson,mgb
bpu-001,BPUN-WIRI,bpu
rts-001,RTS-DM_GE,rts
rts-002,RTS-DM_LS,rts
rts-003,RTS-Patois,rts
avl-001,AVL-FIC,avl
mws-002,Mediatheque-Sonore,mws
acj-001,ArCJ-SAC,acj
cic-002,ICRC-V-S,cic
bvc-001,BVCF-Club44,bvc
acj-002,ArCJ-SP,acj
rts-004,RTS-Bonsoir,rts
mws-003,Mediatheque-f0047E,mws
rts-005,RTS-CSV,rts
apf-001,PlansFixes-All,apf
soz-004,SozArch-F_5146,soz
mav-002,Memoriav-MB_alt_foto,mav
baz-001,BAZ-B_MEI,baz
aag-001,StAAG-ATP-Personen,aag
abe-001,StaBE-Jost,abe
soz-015,SozArch-F_5003,soz
kek-001,KlosterEinsiedeln-KAE__F6_0,kek
adg-001,ADG-68432,adg
lmz-001,SNM-Arbeit,lmz
bar-003,BAR-E27,bar
fss-001,FSS-19,fss
fss-002,FSS-Blum,fss
fgr-001,FotoGR-GR-1000,fgr
fss-003,FSS-ES,fss
bbb-001,Burgerbib-Krebser,bbb
fss-004,FSS-HM,fss
fss-005,FSS-KG,fss
klu-001,StLu-JL,klu
klu-002,StLu-MAW,klu
fss-006,FSS-Braun,fss
fss-007,FSS-SWB,fss
fss-009,FSS-Unikat,fss
fss-008,FSS-DAL,fss
snb-001,NB-IMVOCS,snb
srf-028,SRF-Duer,srf
sap-027,STS-INV_STS,sap
ikg-001,IKG-Liedersammlung_Dr__Alfons_Maissen,ikg
rxb-002,RadioX-Agenda,rxb
srf-013,SRF-AkustischesErbeFR,srf
srf-014,SRF-EMusik,srf
srf-015,SRF-Feuilleton,srf
srf-016,SRF-Hoerspiele,srf
rtr-001,RTR-Radiobestand_Il_Patnal,rtr
srf-017,SRF-Information,srf
srf-018,SRF-Jazz,srf
rxb-003,RadioX-Kampagnen,rxb
srf-019,SRF-Kultur,srf
rxb-004,RadioX-Kulturtipp,rxb
srf-020,SRF-PERSOENLICH,srf
rxb-001,RadioX-Politspecial,rxb
srf-021,SRF-REGI,srf
kak-001,KanalK-SendungsArchiv,kak
srf-023,SRF-Sport,srf
swi-001,Swissinfo-All,swi
srf-024,SRF-UMusik,srf
srf-025,SRF-Unterhaltung,srf
soz-006,SozArch-F_1032,soz
rtr-002,RTR-DM,rtr
srf-026,SRF-DM,srf
soz-007,SozArch-F_1030,soz
rra-001,LoRa-Sendungsarchiv,rra
srf-027,SRF-2000,srf
snp-001,Fonoteca-CHRISTEN,snp
son-001,sonohr-sammlung,son
soz-008,SozArch-F_1006,soz
mgz-001,ZHdK-Schneckenburger,mgz
soz-009,SozArch-F_1015,soz
soz-010,SozArch-F_1005,soz
soz-011,SozArch-F_1013,soz
soz-012,SozArch-F_1000,soz
afz-003,AfZ-Becker-Audiovisuals,afz
zbz-001,ZB-Mus_NL_147,zbz
bmf-001,BMF-Ton,bmf
ubb-001,UBB-000121332,ubb
bab-001,BAB-PA_43,bab
sap-001,Tanzarchiv-42858,sap
sap-002,Tanzarchiv-10369,sap
sap-003,Tanzarchiv-44780,sap
sap-004,Tanzarchiv-9,sap
sap-005,Tanzarchiv-CabaretA,sap
sap-006,Tanzarchiv-121,sap
sap-007,Tanzarchiv-31344,sap
sap-008,Tanzarchiv-36135,sap
sap-009,Tanzarchiv-34420,sap
sap-010,Tanzarchiv-35303,sap
sap-011,Tanzarchiv-34695,sap
sap-012,Tanzarchiv-98,sap
sap-013,Tanzarchiv-35414,sap
sap-014,Tanzarchiv-49114,sap
sap-015,Tanzarchiv-39538,sap
sap-016,Tanzarchiv-31345,sap
sap-017,Tanzarchiv-45858,sap
sap-018,Tanzarchiv-132,sap
sap-019,Tanzarchiv-395,sap
soz-013,SozArch-F_9013,soz
afz-004,AfZ-IB_humem-Archiv,afz
sap-020,Tanzarchiv-163,sap
sap-021,Tanzarchiv-36560,sap
sts-001,StSh-V_I_02,sts
soz-014,SozArch-F_9011,soz
sap-022,Tanzarchiv-45857,sap
sap-023,Tanzarchiv-128,sap
sap-024,Tanzarchiv-141,sap
sap-025,Tanzarchiv-161,sap
sap-026,Tanzarchiv-31026,sap
mws-004,Mediatheque-f0199,mws
atd-001,STD-Dimitri,atd
cag-001,CIC-ASG,cag
bmf-002,BMF-Video,bmf
azh-001,ZHdK_A-Viper,azh
khz-001,KH_ZH-VS,khz
sik-001,SIK-VK,sik
vks-001,Verkehrshaus-video,vks
mcl-001,MCA-VideoArt,mcl
hgk-001,FHNW_HGK-VWW,hgk
snp-002,Fonoteca-VC-YM,snp
bbb-002,Burgerbib-Thierstein,bbb
bvc-002,BVCF-VCH-BVFSP_RC,bvc
soz-016,SozArch-F_StadtInBewegung,soz
ati-003,ASTi-Fototeca,ati
baa-001,USI_B-195,baa
hsl-001,HSLU_DuK-JAW,hsl
raf-001,Radio_Stadtfilter-archiv,raf
utilities/migration/all-record-set-ids.txt
deleted
100644 → 0
View file @
208ba180
Museegruerien-Glasson
ArCJ-SP
Memoriav-BestandMitVideoAccesskopie
Tanzarchiv-34695
FSS-Braun
RadioX-Kampagnen
Memoriav-MB_alt_foto
Radiotelevisionesvizzera-DM
Tanzarchiv-31344
Verkehrshaus-video
SRF-RS
RTS-Patois
ICRC-V-F-CR-H
Tanzarchiv-395
SIK-VK
Mediatheque-Sonore
BAR-SABZ
SRF-LSR
SMA-IBA
FSS-KG
SozArch-Sozarch_F_9045
ZEM-F
ICRC-V-S
CIC-ASG
MCA-VideoArt
SRF-Kultur
SozArch-F_5146
FSS-Blum
Tanzarchiv-35414
SRF-Karussell
Tanzarchiv-132
ZHdK-Schneckenburger
Tanzarchiv-CabaretA
ADG-68432
BAR-E27
Tanzarchiv-34420
SNM-Arbeit
SRF-2000
FSS-19
RTS-DM_GE
Tanzarchiv-121
KlosterEinsiedeln-KAE__F6_0
LS-film
Tanzarchiv-45858
SRF-TS
SRF-Jazz
SRF-Duer
Tanzarchiv-36135
Mediatheque-f0199
SozArch-F_1005
StSh-V_I_02
SozArch-F_5003
Memoriav-KitchenSinkBestandPrivate
AfZ-Bosshard
SozArch-F_9004
Tanzarchiv-10369
SozArch-F_9011
AfZ-Becker-Audiovisuals
SRF-RegioJournalZHSH
BVCF-Club44
NB-IMVOCS
MHL-Constant
FSS-HM
ZHdK-CM
SozArch-F_1013
SRF-bv8
SRF-Hoerspiele
SRF-RJ
SMA-SK
IASA-Collart
ZHdK_A-Viper
SRF-REGI
Radiotelevisionesvizzera-TerzaPagina
Tanzarchiv-36560
StaBE-Jost
StAAG-ATP-Personen
SRF-PDW
ArCJ-SAC
ASTi-Monotti
Memoriav-Test-Docuteam
AfZ-Lutz
SRF-Feuilleton
CL-Leuzinger
FSS-Unikat
Mediatheque-MV_Film
SRF-BPBV8
Tanzarchiv-44780
FSS-SWB
Tanzarchiv-39538
UBB-000121332
RTS-Bonsoir
CentroElisarion-vonKupffervonMayer
SozArch-F_9003
BPUN-WIRI
BMF-Video
Tanzarchiv-35303
RadioX-Agenda
IFRC-FILM
AfZ-IB_humem-Archiv
CdT-gar
BCUF-Thevoz
BAZ-B_MEI
SozArch-F_1030
Burgerbib-Krebser
Museegruerien-Prangey
SozArch-F_1015
FER-RPN
Tanzarchiv-49114
KanalK-SendungsArchiv
BMF-Ton
SRF-MTW
SozArch-F_1032
RTS-DM_LS
Radiotelevisionesvizzera-Documentario
STD-Dimitri
SRF-DM
SRF-Sport
RTR-DM
RadioX-Kulturtipp
RadioX-Politspecial
Tanzarchiv-42858
ArchivioDonetta-Donetta
BAB-PA_43
PlansFixes-All
Tanzarchiv-161
Fonoteca-CHRISTEN
SozArch-F_9005
SozArch-F_9013
SozArch-F_1000
Museegruerien-Morel
LoRa-Sendungsarchiv
ASTi-FPC
CS-CS_CF
FHNW_HGK-VWW
LJ-Filmbestand_Langjahr_GmbH
Memoriav-MB_alt_film
SRF-CH-M
SRF-Regi
ZB-Mus_NL_147
Mediatheque-f0047E
RTS-CSV
SFW_CJS_CGS-SFW_CJS_CGS
SRF-Kassensturz
SRF-Netto
SRF-Rundschau
Tanzarchiv-31345
SRF-Rundschau1968-1975
SRF-UMusik
Tanzarchiv-45857
KBGR-AV
AVL-FIC
FSS-ES
Tanzarchiv-141
SozArch-F_1006