Script zum Download der Metadaten (Kap 2.2)

2025-05-11 00:00:39 +02:00 · 2017-03-21 23:16:11 +01:00 · 2017-03-21 23:16:11 +01:00 · 4952b22088
commit 4952b22088
parent c8d3705459
1 changed files with 81 additions and 0 deletions
--- a/scripte/sru-download.sh
+++ b/scripte/sru-download.sh
@@ -0,0 +1,81 @@
+#!/bin/bash
+# Download metadata records in batches from an SRU interface using curl
+# sru-download.sh, Felix Lohmeier, v0.2, 16.03.2017
+# https://github.com/felixlohmeier/seminar-praxis-der-digitalen-bibliothek
+
+# Variables (adjust here as needed)
+url="http://sru.gbv.de/opac-de-960-3"
+query="pica.ppn=.*"
+format="marcxml"
+outputdir="download"
+filename="hsh_ksf"
+recordlimitperquery=100
+# Further technical variables
+date="$(date +%F)"
+datelog="$(date +%Y%m%d_%H%M%S)"
+command="?operation=searchRetrieve"
+startrecord=1
+counter=$((startrecord + recordlimitperquery - 1))
+
+# Create the output directory (if it does not exist yet)
+mkdir -p -- "$outputdir"
+
+# Duplicate all further output (stdout and stderr) into a log file
+exec &> >(tee -a "$outputdir/${filename}_${datelog}.log")
+
+# Read the number of records; stop if no number came back (the loop arithmetic needs one)
+records=$(curl --silent "${url}${command}&query=${query}&recordSchema=${format}" | sed 's/</\n/g' | sed '/^\//d' | sed 's/:/\n/g' | grep numberOfRecords | cut -c 17-)
+[[ "$records" =~ ^[0-9]+$ ]] || { echo 1>&2 "Fehler: Anzahl der Datensätze konnte nicht ermittelt werden (Antwort: '${records}')."; exit 1; }
+
+# Print the settings
+echo "SRU-Schnittstelle:       ${url}"
+echo "Suchabfrage:             ${query}"
+echo "Format:                  ${format}"
+echo "Anzahl Datensätze:       ${records}"
+echo "Datensätze pro Datei:    ${recordlimitperquery}"
+echo "Download in Verzeichnis: $(readlink -f "${outputdir}")"
+echo "Beispiel Dateiname:      ${filename}_${date}_$(printf "%.7i\n" "${startrecord}")-$(printf "%.7i\n" "${counter}").xml"
+echo "Logdatei:                ${filename}_${datelog}.log"
+echo ""
+
+# Print the start time
+echo "Startzeitpunkt: $(date)"
+echo ""
+
+# One curl call per batch; startrecord (not counter) is compared so the last, partial batch is fetched too
+while (( startrecord <= records )); do
+    outfile="$outputdir/${filename}_${date}_$(printf "%.7i\n" "${startrecord}")-$(printf "%.7i\n" "${counter}").xml"
+    echo "Download Records ${startrecord} bis ${counter}..."
+    curl "${url}${command}&query=${query}&maximumRecords=${recordlimitperquery}&recordSchema=${format}&startRecord=${startrecord}" > "$outfile"
+    # Immediate check of the downloaded file, if the format is marcxml
+    if [ "$format" = "marcxml" ]; then
+        echo "Ergebnis: $(grep -c -H '<controlfield tag="001">' "$outfile") records"
+    fi
+    echo ""
+    (( counter += recordlimitperquery, startrecord += recordlimitperquery ))
+done
+
+# Check of the complete download, if the format is marcxml
+if [ "$format" = "marcxml" ]; then
+    echo "Gesamtanzahl der Records im Ordner ${outputdir}:"
+    cat -- "$outputdir"/*.xml | grep -c '<controlfield tag="001">'
+    echo ""
+    echo "Dateien, die weniger als 10 Records enthalten:"
+    while IFS= read -r -d '' i; do
+        testfilerecords="$(grep -c -h '<controlfield tag="001">' "${i}")"
+        if (( testfilerecords < 10 )); then
+            echo 1>&2 "${i}: ${testfilerecords}"
+        fi
+    done < <(find "$outputdir" -type f -name '*.xml' -print0)
+    echo ""
+fi
+
+# Check whether the database changed while the download was running
+recordsafterdownload=$(curl --silent "${url}${command}&query=${query}&recordSchema=${format}" | sed 's/</\n/g' | sed '/^\//d' | sed 's/:/\n/g' | grep numberOfRecords | cut -c 17-)
+if [ "$records" != "$recordsafterdownload" ]; then
+    echo 1>&2 "Warnung: Die Suchabfrage an die SRU-Schnittstelle hat vor Beginn des Downloads eine andere Gesamtanzahl an Datensätzen ergeben (${records}) als nach dem Download (${recordsafterdownload}). Das ist ein Indiz dafür, dass die Datenbank zwischenzeitlich verändert wurde. Es ist wahrscheinlich, dass dadurch einzelne Datensätze im Download fehlen."
+fi
+
+# Print the end time
+echo "Endzeitpunkt: $(date)"
+echo ""