Commit 0247f5f7 authored by Jonas Waeber
Browse files

Remove old scripts

parent 208ba180
from kafka.admin import KafkaAdminClient
if __name__ == '__main__':
    # Maintenance script: removes every topic whose name begins with 'p1'
    # from the Kafka cluster reachable through the brokers listed below.
    kafka_admin = KafkaAdminClient(
        client_id='admin-client-baz-mei',
        bootstrap_servers="mb-ka1.memobase.unibas.ch:9092,mb-ka2.memobase.unibas.ch:9092,mb-ka3.memobase.unibas.ch:9092",
    )

    # Select the topics to drop by name prefix.
    obsolete_topics = [
        topic_name
        for topic_name in kafka_admin.list_topics()
        if topic_name.startswith('p1')
    ]

    # A single request deletes all selected topics.
    kafka_admin.delete_topics(obsolete_topics)
import sys
from kafka import KafkaConsumer
if __name__ == '__main__':
    # Utility script: subscribes to the topic named by the first command line
    # argument and prints every record it receives, starting from the
    # earliest available offset when the consumer group has none committed.
    if len(sys.argv) < 2:
        # Fail with a readable message instead of an IndexError traceback.
        sys.exit(f"Usage: {sys.argv[0]} <topic>")

    consumer = KafkaConsumer(
        bootstrap_servers="mb-ka1.memobase.unibas.ch:9092,"
                          "mb-ka2.memobase.unibas.ch:9092,"
                          "mb-ka3.memobase.unibas.ch:9092",
        client_id='utility-consumer',
        group_id='utility-consumer',
        auto_offset_reset='earliest'
    )
    try:
        consumer.subscribe([sys.argv[1]])
        # Iterating the consumer blocks and yields records as they arrive.
        for record in consumer:
            print(record)
    except KeyboardInterrupt:
        # Ctrl-C is the expected way to stop this endless loop.
        pass
    finally:
        # Always release the connection, whatever ended the loop.
        consumer.close()
from sys import argv
from kafka.admin import KafkaAdminClient, NewTopic
from kafka.errors import TopicAlreadyExistsError
if __name__ == '__main__':
    # Utility script: creates one topic, named by the first command line
    # argument, with 3 partitions and a replication factor of 1.
    if len(argv) < 2:
        # Fail with a readable message instead of an IndexError traceback.
        raise SystemExit(f"Usage: {argv[0]} <topic-name>")

    admin_client = KafkaAdminClient(
        bootstrap_servers="mb-ka1.memobase.unibas.ch:9092,"
                          "mb-ka2.memobase.unibas.ch:9092,"
                          "mb-ka3.memobase.unibas.ch:9092",
        client_id='admin-client-baz-mei'
    )
    new_topic = NewTopic(name=argv[1], num_partitions=3, replication_factor=1)
    try:
        # Only the call that can raise stays inside the try body.
        admin_client.create_topics(new_topics=[new_topic], validate_only=False)
    except TopicAlreadyExistsError:
        print("Topics already exists!")
    else:
        print(
            f"Created the following "
            f"topics: {new_topic.name} (p={new_topic.num_partitions}, rf={new_topic.replication_factor}).")
    finally:
        # Release the broker connections on every exit path.
        admin_client.close()
from kafka.admin import KafkaAdminClient
if __name__ == '__main__':
    # Utility script: prints the names of all topics known to the cluster,
    # one per line, in ascending order.
    kafka_admin = KafkaAdminClient(
        client_id='admin-client-baz-mei',
        bootstrap_servers="mb-ka1.memobase.unibas.ch:9092,mb-ka2.memobase.unibas.ch:9092,mb-ka3.memobase.unibas.ch:9092",
    )

    topic_names = list(kafka_admin.list_topics())
    topic_names.sort()
    for topic_name in topic_names:
        print(topic_name)
"Andere","Andere","Autres","Altri"
"Geräusche","Geräusch","Bruit","Rumore"
"Musik","Musik","Musique","Musica"
"Stumm","Stumm","Muet","Muto"
"Dialekt","Dialekt","dialecte","dialetto"
"Patois romand","Westschweizer Dialekt","patois suisse romand","patois svizzero"
"Afrikanische Sprache","Afrikanische Sprache","langue d'Afrique","lingua d'Africa"
\ No newline at end of file
This diff is collapsed.
"Q1398263","Liechtensteinische Mundarten","","dialetto liechtensteinese"
"Q1435171","Solothurner Dialekt","",""
"Q4068780","Aargauerisch","","dialetto tedesco dell'Argovia"
"Q13601748","Penan","penan",""
"Q20012755","St.-Galler-Deutsch","","dialetto tedesco di San Gallo"
"Q65409718","","",""
"Q98969266","","",""
"Q98969282","Nidwaldner Dialekt","",""
"Q98969307","Schaffhauser Dialekt","",""
"Q98969316","","",""
"Q98969320","Thurgauer Dialekt","",""
"Q98969581","","",""
"Q98969595","Glarner Dialekt","",""
"Q98969600","","",""
"Q98969866","Zuger Dialekt","",""
# Returns the rdfs:label values of a single Wikidata entity, restricted to
# the German, French and Italian language tags.
# NOTE(review): the token after "wd:" is presumably substituted with a
# concrete Q-identifier by the calling script before the query is sent.
SELECT ?item
WHERE
{
  wd:PLACEHOLDER rdfs:label ?item .
  FILTER(lang(?item) IN ("de", "fr", "it"))
}
\ No newline at end of file
import csv
import logging
import sys
from SPARQLWrapper import SPARQLWrapper, JSON
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
def read_csv_file(path: str):
    """Collect the mapped values from a language mapping CSV file.

    The first row is treated as a header and skipped. In every other row the
    first column is ignored; of the remaining columns, non-empty values in
    the first six are collected as identifiers and non-empty values in any
    later column are collected as custom strings.

    :param path: Path of the CSV file to read (decoded as UTF-8).
    :return: A tuple ``(ids, strings)`` of two sets of strings.
    """
    ids = set()
    strings = set()
    logging.info("Reading language mapping file.")
    # An explicit encoding keeps non-ASCII values intact regardless of the
    # platform default; newline='' is what the csv module asks for so that
    # it can handle line endings itself.
    with open(path, 'r', encoding='utf-8', newline='') as fp:
        csv_rows = csv.reader(fp, dialect='unix')
        # skip the header
        next(csv_rows, None)
        for r in csv_rows:
            for index, item in enumerate(r[1:]):
                if item == "":
                    continue
                # index counts from the second column of the row.
                if index <= 5:
                    ids.add(item)
                else:
                    strings.add(item)
    logging.info("Collected all mapped wikidata identifiers and custom strings.")
    logging.info(f"There are {len(ids)} unique wikidata identifiers present.")
    # Sorted so that the log line is the same on every run.
    logging.info(f"The following custom facet values are present: {', '.join(sorted(strings))}")
    return ids, strings
if __name__ == '__main__':
    # Builds 'language_labels.csv': one row per Wikidata identifier found in
    # the language mapping file, holding its German, French and Italian
    # label as returned by the Wikidata query service, followed by the
    # hand-maintained rows of 'custom_labels.csv'. Identifiers lacking at
    # least one of the three labels are also written to 'missing_labels.csv'.
    source_path = '../../global-configs/prod/transforms/languages.csv'
    wikidata_identifiers, custom_strings = read_csv_file(source_path)

    logging.info("Check if all custom facet values are mapped to a label.")
    with open('custom_labels.csv', 'r', encoding='utf-8') as cl:
        custom_label_text = cl.read()
    custom_labels = csv.reader(custom_label_text.split('\n'), dialect='unix')
    # A blank line (e.g. the one a trailing newline produces) parses to an
    # empty row; skip it instead of failing on row[0].
    defined_labels = {row[0] for row in custom_labels if row}
    difference = custom_strings.difference(defined_labels)
    if difference:
        logging.error(f"The following custom facet values have no labels: {', '.join(sorted(difference))}.")
    else:
        logging.info("All custom facet values have a label defined.")

    logging.info("Setting up connection to service.")
    s = SPARQLWrapper("https://query.wikidata.org/sparql",
                      agent='Python Script (University Library Basel, jonas.waeber@unibas.ch)')
    # The return format is the same for every request, so set it once.
    s.setReturnFormat(JSON)

    logging.info("Reading SPARQL template.")
    with open('query.sparql', 'r', encoding='utf-8') as sp:
        request_template = sp.read()

    logging.info("Writing the language labels file.")
    missing_labels = list()
    # Order numerically by the digits after the 'Q' prefix.
    wikidata_identifiers = sorted(wikidata_identifiers, key=lambda x: int(x.replace('Q', '')))
    # newline='' lets the csv writer emit the dialect's own line terminator
    # on every platform.
    with open('language_labels.csv', 'w', encoding='utf-8', newline='') as w:
        writer = csv.writer(w, dialect='unix')
        writer.writerow(['id', 'de', 'fr', 'it'])
        for q in wikidata_identifiers:
            request = request_template.replace('PLACEHOLDER', q)
            s.setQuery(request)
            logging.info(f"Query Wikidata service for value {q}.")
            results = s.query().convert()
            # Map language tag -> label; if a tag occurs more than once the
            # last binding wins.
            lang_values = dict()
            for row in results['results']['bindings']:
                lang_values[row['item']['xml:lang']] = row['item']['value']
            de = lang_values.get('de', '')
            fr = lang_values.get('fr', '')
            it = lang_values.get('it', '')
            writer.writerow([q, de, fr, it])
            if de == '' or fr == '' or it == '':
                missing_labels.append([q, de, fr, it])
        # add the custom facet value labels at the end.
        w.write(custom_label_text)

    if missing_labels:
        logging.info("Writing missing labels.")
        # missing_labels is filled while walking the sorted identifiers, so
        # it is already in ascending numeric order.
        with open('missing_labels.csv', 'w', encoding='utf-8', newline='') as w:
            writer = csv.writer(w, dialect='unix')
            for row in missing_labels:
                writer.writerow(row)
    # NOTE(review): this script writes language labels; the word "format" in
    # the message below may be a leftover from a sibling script - confirm.
    logging.info("Finished processing format labels.")
ID,ID_bisher
aag,StAAG
abe,StaBE
acj,ArCJ
adg,ADG
afz,AfZ
agl,LAGL
apf,PlansFixes
atd,STD
ati,ASTi
avl,AVL
azh,ZHdK_A
baa,USI_B
bab,BAB
bar,BAR
baz,BAZ
bbb,Burgerbib
bcf,BCUF
bfl,swisstopo
bmf,BMF
bpu,BPUN
bvc,BVCF
cde,Cde
cdt,CdT
cic,CIC
cic,ICRC
clg,CL
clu,C44
csa,CS
csa_1,
csa_2,
fad,ArchivioDonetta
fer,FER
fgr,FotoGR
fgr_1,
fpc,FPC
fss,FSS
gvs,GVS
hgk,FHNW_HGK
hsl,HSLU_DuK
ias,IASA
ikg,IKG
ikr,IFRC
kak,KanalK
kbg,KBGR
kek,KlosterEinsiedeln
khz,KH_ZH
klu,StLu
kmm,MuseeLaNeuveville
lfg,LJ
lkb,LS
lmz,SNM
maa,SMA
mav,Memoriav
mcl,MCA
mdl,MdL
meg,meg
mel,Elysee
mem,CentroElisarion
mfk,mfk
mgb,Museegruerien
mgz,ZHdK
mgz_1,
mhl,MHL
mov,Movendo
mws,Mediatheque
raf,Radio_Stadtfilter
rkk,KuK
rra,LoRa
rti,Radiotelevisionesvizzera
rtr,RTR
rts,RTS
rts_1,
rxb,RadioX
sap,Tanzarchiv
sap_1,
sap_2,
sbb,SBB
sik,SIK
snb,NB
snp,Fonoteca
son,sonohr
soz,SozArch
srf,SRF
srf_1,
srf_2,
sts,StSh
swi,Swissinfo
ubb,UBB
vks,Verkehrshaus
zbz,ZB
zem,ZEM
identifier_new,identifier_old,institution_id
csa-001,CS-CS_CF,csa
cdt-001,CdT-SON,cdt
ati-002,ASTi-FPC,ati
rti-001,Radiotelevisionesvizzera-Documentario,rti
srf-001,SRF-bv8,srf
srf-002,SRF-BPBV8,srf
srf-003,SRF-CH-M,srf
srf-004,SRF-Karussell,srf
srf-005,SRF-Kassensturz,srf
srf-006,SRF-LSR,srf
srf-007,SRF-MTW,srf
srf-008,SRF-Netto,srf
srf-009,SRF-PDW,srf
srf-010,SRF-RJ,srf
srf-011,SRF-RS,srf
srf-012,SRF-TS,srf
mfk-001,mfk-FLM,mfk
soz-005,SozArch-Sozarch_F_9045,soz
afz-001,AfZ-Lutz,afz
maa-001,SMA-IBA,maa
zem-001,ZEM-F,zem
soz-001,SozArch-F_9005,soz
soz-002,SozArch-F_9004,soz
lfg-001,LJ-Filmbestand_Langjahr_GmbH,lfg
kbg-001,KBGR-AV,kbg
maa-002,SMA-SK,maa
agl-001,LAGL-PA_111_Sch_Y,agl
bar-001,SFW_CJS_CGS-SFW_CJS_CGS,bar
bar-002,BAR-SABZ,bar
soz-003,SozArch-F_9003,soz
afz-002,AfZ-Bosshard,afz
mav-001,Memoriav-MB_alt_film,mav
clg-001,CL-Leuzinger,clg
lkb-001,LS-film,lkb
ati-001,ASTi-Monotti,ati
mem-001,CentroElisarion-vonKupffervonMayer,mem
fad-001,ArchivioDonetta-Donetta,fad
rti-002,Radiotelevisionesvizzera-DM,rti
cde-001,Cde-LEYDI,cde
fer-001,FER-RPN,fer
rti-003,Radiotelevisionesvizzera-Archiv,rti
cdt-002,CdT-gar,cdt
cic-001,ICRC-V-F-CR-H,cic
ikr-001,IFRC-FILM,ikr
mgb-001,Museegruerien-Morel,mgb
mhl-001,MHL-Constant,mhl
mws-001,Mediatheque-Schmid,mws
mgb-002,Museegruerien-Prangey,mgb
bcf-001,BCUF-Thevoz,bcf
mdl-001,MdL-ML,mdl
ias-001,IASA-Collart,ias
kmm-001,MuseeLaNeuveville-Hirt,kmm
mgb-003,Museegruerien-Glasson,mgb
bpu-001,BPUN-WIRI,bpu
rts-001,RTS-DM_GE,rts
rts-002,RTS-DM_LS,rts
rts-003,RTS-Patois,rts
avl-001,AVL-FIC,avl
mws-002,Mediatheque-Sonore,mws
acj-001,ArCJ-SAC,acj
cic-002,ICRC-V-S,cic
bvc-001,BVCF-Club44,bvc
acj-002,ArCJ-SP,acj
rts-004,RTS-Bonsoir,rts
mws-003,Mediatheque-f0047E,mws
rts-005,RTS-CSV,rts
apf-001,PlansFixes-All,apf
soz-004,SozArch-F_5146,soz
mav-002,Memoriav-MB_alt_foto,mav
baz-001,BAZ-B_MEI,baz
aag-001,StAAG-ATP-Personen,aag
abe-001,StaBE-Jost,abe
soz-015,SozArch-F_5003,soz
kek-001,KlosterEinsiedeln-KAE__F6_0,kek
adg-001,ADG-68432,adg
lmz-001,SNM-Arbeit,lmz
bar-003,BAR-E27,bar
fss-001,FSS-19,fss
fss-002,FSS-Blum,fss
fgr-001,FotoGR-GR-1000,fgr
fss-003,FSS-ES,fss
bbb-001,Burgerbib-Krebser,bbb
fss-004,FSS-HM,fss
fss-005,FSS-KG,fss
klu-001,StLu-JL,klu
klu-002,StLu-MAW,klu
fss-006,FSS-Braun,fss
fss-007,FSS-SWB,fss
fss-009,FSS-Unikat,fss
fss-008,FSS-DAL,fss
snb-001,NB-IMVOCS,snb
srf-028,SRF-Duer,srf
sap-027,STS-INV_STS,sap
ikg-001,IKG-Liedersammlung_Dr__Alfons_Maissen,ikg
rxb-002,RadioX-Agenda,rxb
srf-013,SRF-AkustischesErbeFR,srf
srf-014,SRF-EMusik,srf
srf-015,SRF-Feuilleton,srf
srf-016,SRF-Hoerspiele,srf
rtr-001,RTR-Radiobestand_Il_Patnal,rtr
srf-017,SRF-Information,srf
srf-018,SRF-Jazz,srf
rxb-003,RadioX-Kampagnen,rxb
srf-019,SRF-Kultur,srf
rxb-004,RadioX-Kulturtipp,rxb
srf-020,SRF-PERSOENLICH,srf
rxb-001,RadioX-Politspecial,rxb
srf-021,SRF-REGI,srf
kak-001,KanalK-SendungsArchiv,kak
srf-023,SRF-Sport,srf
swi-001,Swissinfo-All,swi
srf-024,SRF-UMusik,srf
srf-025,SRF-Unterhaltung,srf
soz-006,SozArch-F_1032,soz
rtr-002,RTR-DM,rtr
srf-026,SRF-DM,srf
soz-007,SozArch-F_1030,soz
rra-001,LoRa-Sendungsarchiv,rra
srf-027,SRF-2000,srf
snp-001,Fonoteca-CHRISTEN,snp
son-001,sonohr-sammlung,son
soz-008,SozArch-F_1006,soz
mgz-001,ZHdK-Schneckenburger,mgz
soz-009,SozArch-F_1015,soz
soz-010,SozArch-F_1005,soz
soz-011,SozArch-F_1013,soz
soz-012,SozArch-F_1000,soz
afz-003,AfZ-Becker-Audiovisuals,afz
zbz-001,ZB-Mus_NL_147,zbz
bmf-001,BMF-Ton,bmf
ubb-001,UBB-000121332,ubb
bab-001,BAB-PA_43,bab
sap-001,Tanzarchiv-42858,sap
sap-002,Tanzarchiv-10369,sap
sap-003,Tanzarchiv-44780,sap
sap-004,Tanzarchiv-9,sap
sap-005,Tanzarchiv-CabaretA,sap
sap-006,Tanzarchiv-121,sap
sap-007,Tanzarchiv-31344,sap
sap-008,Tanzarchiv-36135,sap
sap-009,Tanzarchiv-34420,sap
sap-010,Tanzarchiv-35303,sap
sap-011,Tanzarchiv-34695,sap
sap-012,Tanzarchiv-98,sap
sap-013,Tanzarchiv-35414,sap
sap-014,Tanzarchiv-49114,sap
sap-015,Tanzarchiv-39538,sap
sap-016,Tanzarchiv-31345,sap
sap-017,Tanzarchiv-45858,sap
sap-018,Tanzarchiv-132,sap
sap-019,Tanzarchiv-395,sap
soz-013,SozArch-F_9013,soz
afz-004,AfZ-IB_humem-Archiv,afz
sap-020,Tanzarchiv-163,sap
sap-021,Tanzarchiv-36560,sap
sts-001,StSh-V_I_02,sts
soz-014,SozArch-F_9011,soz
sap-022,Tanzarchiv-45857,sap
sap-023,Tanzarchiv-128,sap
sap-024,Tanzarchiv-141,sap
sap-025,Tanzarchiv-161,sap
sap-026,Tanzarchiv-31026,sap
mws-004,Mediatheque-f0199,mws
atd-001,STD-Dimitri,atd
cag-001,CIC-ASG,cag
bmf-002,BMF-Video,bmf
azh-001,ZHdK_A-Viper,azh
khz-001,KH_ZH-VS,khz
sik-001,SIK-VK,sik
vks-001,Verkehrshaus-video,vks
mcl-001,MCA-VideoArt,mcl
hgk-001,FHNW_HGK-VWW,hgk
snp-002,Fonoteca-VC-YM,snp
bbb-002,Burgerbib-Thierstein,bbb
bvc-002,BVCF-VCH-BVFSP_RC,bvc
soz-016,SozArch-F_StadtInBewegung,soz
ati-003,ASTi-Fototeca,ati
baa-001,USI_B-195,baa
hsl-001,HSLU_DuK-JAW,hsl
raf-001,Radio_Stadtfilter-archiv,raf
Museegruerien-Glasson
ArCJ-SP
Memoriav-BestandMitVideoAccesskopie
Tanzarchiv-34695
FSS-Braun
RadioX-Kampagnen
Memoriav-MB_alt_foto
Radiotelevisionesvizzera-DM
Tanzarchiv-31344
Verkehrshaus-video
SRF-RS
RTS-Patois
ICRC-V-F-CR-H
Tanzarchiv-395
SIK-VK
Mediatheque-Sonore
BAR-SABZ
SRF-LSR
SMA-IBA
FSS-KG
SozArch-Sozarch_F_9045
ZEM-F
ICRC-V-S
CIC-ASG
MCA-VideoArt
SRF-Kultur
SozArch-F_5146
FSS-Blum
Tanzarchiv-35414
SRF-Karussell
Tanzarchiv-132
ZHdK-Schneckenburger
Tanzarchiv-CabaretA
ADG-68432
BAR-E27
Tanzarchiv-34420
SNM-Arbeit
SRF-2000
FSS-19
RTS-DM_GE
Tanzarchiv-121
KlosterEinsiedeln-KAE__F6_0
LS-film
Tanzarchiv-45858
SRF-TS
SRF-Jazz
SRF-Duer
Tanzarchiv-36135
Mediatheque-f0199
SozArch-F_1005
StSh-V_I_02
SozArch-F_5003
Memoriav-KitchenSinkBestandPrivate
AfZ-Bosshard
SozArch-F_9004
Tanzarchiv-10369
SozArch-F_9011
AfZ-Becker-Audiovisuals
SRF-RegioJournalZHSH
BVCF-Club44
NB-IMVOCS
MHL-Constant
FSS-HM
ZHdK-CM
SozArch-F_1013
SRF-bv8
SRF-Hoerspiele
SRF-RJ
SMA-SK
IASA-Collart
ZHdK_A-Viper
SRF-REGI
Radiotelevisionesvizzera-TerzaPagina
Tanzarchiv-36560
StaBE-Jost
StAAG-ATP-Personen
SRF-PDW
ArCJ-SAC
ASTi-Monotti
Memoriav-Test-Docuteam
AfZ-Lutz
SRF-Feuilleton
CL-Leuzinger
FSS-Unikat
Mediatheque-MV_Film
SRF-BPBV8
Tanzarchiv-44780
FSS-SWB
Tanzarchiv-39538
UBB-000121332
RTS-Bonsoir
CentroElisarion-vonKupffervonMayer
SozArch-F_9003
BPUN-WIRI
BMF-Video
Tanzarchiv-35303
RadioX-Agenda
IFRC-FILM
AfZ-IB_humem-Archiv
CdT-gar
BCUF-Thevoz
BAZ-B_MEI
SozArch-F_1030
Burgerbib-Krebser
Museegruerien-Prangey
SozArch-F_1015
FER-RPN
Tanzarchiv-49114
KanalK-SendungsArchiv
BMF-Ton
SRF-MTW
SozArch-F_1032
RTS-DM_LS
Radiotelevisionesvizzera-Documentario
STD-Dimitri
SRF-DM
SRF-Sport
RTR-DM
RadioX-Kulturtipp
RadioX-Politspecial
Tanzarchiv-42858
ArchivioDonetta-Donetta
BAB-PA_43
PlansFixes-All
Tanzarchiv-161
Fonoteca-CHRISTEN
SozArch-F_9005
SozArch-F_9013
SozArch-F_1000
Museegruerien-Morel
LoRa-Sendungsarchiv
ASTi-FPC
CS-CS_CF
FHNW_HGK-VWW
LJ-Filmbestand_Langjahr_GmbH
Memoriav-MB_alt_film
SRF-CH-M
SRF-Regi
ZB-Mus_NL_147
Mediatheque-f0047E
RTS-CSV
SFW_CJS_CGS-SFW_CJS_CGS
SRF-Kassensturz
SRF-Netto
SRF-Rundschau
Tanzarchiv-31345
SRF-Rundschau1968-1975
SRF-UMusik
Tanzarchiv-45857
KBGR-AV
AVL-FIC
FSS-ES