2021-03-02 13:32:12 +01:00
|
|
|
# https://github.com/opencultureconsulting/openrefine-task-runner
|
2021-01-14 16:51:39 +01:00
|
|
|
|
|
|
|
# Taskfile schema version
version: "3"

# Taskfiles included from the paths of the same name; their tasks are
# addressed with the namespace prefix, e.g. bielefeld:main
includes:
  bielefeld: bielefeld
  muenster: muenster
  siegen: siegen
  wuppertal: wuppertal

# do not echo the commands themselves, only their output
silent: true

# output mode for commands that run in parallel
output: prefixed
|
2021-01-14 16:51:39 +01:00
|
|
|
|
|
|
|
vars:
  # current date formatted as YYYY-MM-DD (Go reference-time layout);
  # used by the zip and diff tasks to stamp their output file names
  DATE: >-
    {{ now | date "2006-01-02"}}
|
|
|
|
|
|
|
|
env:
  # Both values are computed by running the given shell command.
  # readlink -m canonicalizes the path without requiring it to exist,
  # so the variables are set even before the install task has been run.

  # path of the OpenRefine start script
  OPENREFINE:
    sh: 'readlink -m .openrefine/refine'

  # path of the openrefine-client executable
  CLIENT:
    sh: 'readlink -m .openrefine/client'
|
2021-01-14 16:51:39 +01:00
|
|
|
|
|
|
|
tasks:
|
|
|
|
default:
  desc: 'execute all projects in parallel'
  # the main task of every included Taskfile is listed as a dependency
  deps:
    - task: 'bielefeld:main'
    - task: 'muenster:main'
    - task: 'siegen:main'
    - task: 'wuppertal:main'
|
2021-01-19 12:30:46 +01:00
|
|
|
|
2021-03-02 13:32:12 +01:00
|
|
|
install:
  desc: '(re)install OpenRefine and openrefine-client into subdirectory .openrefine'
  cmds:
    # delete existing install and recreate folder
    - |
      rm -rf .openrefine
      mkdir -p .openrefine
    # download OpenRefine archive (folded scalar: the lines form one command)
    - >
      wget --no-verbose -O openrefine.tar.gz
      https://github.com/OpenRefine/OpenRefine/releases/download/3.4.1/openrefine-linux-3.4.1.tar.gz
    # install OpenRefine into subdirectory .openrefine; the archive's
    # top-level directory is stripped so the files land directly in it
    - |
      tar -xzf openrefine.tar.gz -C .openrefine --strip-components=1
      rm openrefine.tar.gz
    # optimize OpenRefine for batch processing
    - |
      sed -i 's/cd `dirname $0`/cd "$(dirname "$0")"/' ".openrefine/refine" # fix path issue in OpenRefine startup file
      sed -i '$ a JAVA_OPTIONS=-Drefine.headless=true' ".openrefine/refine.ini" # do not try to open OpenRefine in browser
      sed -i 's/#REFINE_AUTOSAVE_PERIOD=60/REFINE_AUTOSAVE_PERIOD=1440/' ".openrefine/refine.ini" # set autosave period from 5 minutes to 24 hours (1440 minutes)
    # download openrefine-client into subdirectory .openrefine
    - >
      wget --no-verbose -O .openrefine/client
      https://github.com/opencultureconsulting/openrefine-client/releases/download/v0.3.10/openrefine-client_0-3-10_linux
    # make client executable
    - chmod +x .openrefine/client
|
|
|
|
|
|
|
|
start:
  # expects the variables PROJECT, PORT and RAM to be passed by the caller
  dir: ./{{.PROJECT}}/refine
  cmds:
    # verify that OpenRefine is installed
    - |
      [ -f "$OPENREFINE" ] || {
        echo 1>&2 "OpenRefine missing; try task install"; exit 1
      }
    # delete temporary files and log file of previous run
    - |
      rm -rf ./*.project* workspace.json
      rm -rf "{{.PROJECT}}.log"
    # launch OpenRefine in the background with a specific data directory and
    # append its output to a log file.
    # NOTE(review): the relative -d path presumably resolves against the
    # directory of the refine script (which changes into its own directory),
    # not against this task's dir - confirm.
    - >
      "$OPENREFINE" -v warn -p {{.PORT}} -m {{.RAM}}
      -d ../{{.PROJECT}}/refine
      >> "{{.PROJECT}}.log" 2>&1 &
    # wait until the OpenRefine API is available: poll the start page once
    # per second and give up after 30 seconds
    - >-
      timeout 30s bash -c "until
      wget -q -O - http://localhost:{{.PORT}} | cat | grep -q -o OpenRefine;
      do sleep 1;
      done"
|
|
|
|
|
|
|
|
stop:
  # expects the variables PROJECT and PORT to be passed by the caller
  dir: ./{{.PROJECT}}/refine
  cmds:
    # shut down OpenRefine gracefully: signal the process that uses the port
    # and wait until it has exited
    - |
      refine_pid=$(lsof -t -i:{{.PORT}})
      kill $refine_pid
      while ps -p $refine_pid > /dev/null; do
        sleep 1
      done
    # archive the OpenRefine project: pack the *.project directory whose
    # metadata.json mentions the project name
    - >
      tar cfz "{{.PROJECT}}.openrefine.tar.gz"
      -C $(grep -l "{{.PROJECT}}" *.project/metadata.json | cut -d '/' -f 1)
      .
    # delete temporary files
    - rm -rf ./*.project* workspace.json
|
|
|
|
|
|
|
|
kill:
  # expects the variables PROJECT and PORT to be passed by the caller
  dir: ./{{.PROJECT}}/refine
  cmds:
    # shut down OpenRefine immediately (SIGKILL) to save time and disk space,
    # then wait until the process is gone
    - |
      refine_pid=$(lsof -t -i:{{.PORT}})
      kill -9 $refine_pid
      while ps -p $refine_pid > /dev/null; do
        sleep 1
      done
    # delete temporary files
    - rm -rf ./*.project* workspace.json
|
2021-01-20 15:35:02 +01:00
|
|
|
|
2021-01-19 12:30:46 +01:00
|
|
|
check:
  # expects the variables PROJECT and MINIMUM to be passed by the caller
  dir: ./{{.PROJECT}}/refine
  cmds:
    # find log file(s) and check for "exception" or "error".
    # The emptiness guard matters: without any file operand grep would read
    # from standard input instead of reporting "nothing found".
    - |
      logs=$(find . -name '*.log')
      if [ -n "$logs" ] && grep -i 'exception\|error' $logs; then
        echo 1>&2 "log contains warnings!"; exit 1
      fi
    # check that at least the minimum number of records was generated
    # (counts the lines of the export that contain "recordIdentifier")
    - |
      if (( {{.MINIMUM}} > $(grep -c recordIdentifier {{.PROJECT}}.txt) )); then
        echo 1>&2 "Unerwartet geringe Anzahl an Datensätzen in $PWD/{{.PROJECT}}.txt!"; exit 1
      fi
|
2021-01-19 12:30:46 +01:00
|
|
|
|
|
|
|
split:
  # label used for this task instead of its plain name; it contains the project
  label: "{{.TASK}}-{{.PROJECT}}"
  dir: ./{{.PROJECT}}/split
  # input and output files declared for Task's up-to-date check
  sources:
    - ../refine/{{.PROJECT}}.txt
  generates:
    - ./*.xml
  cmds:
    # refuse to run without a project name
    - test -n "{{.PROJECT}}"
    # split the export into single files (xx00, xx01, ...), one per
    # <mets:mets element
    - >-
      csplit -s -z ../refine/{{.PROJECT}}.txt '/<mets:mets /' "{*}"
    # delete XML files that may already exist
    - rm -f *.xml
    # use the record identifier as file name
    - >-
      for f in xx*; do mv "$f" "$(xmllint --xpath "//*[local-name(.) = 'recordIdentifier']/text()" "$f").xml"; done
|
2021-01-19 12:30:46 +01:00
|
|
|
|
|
|
|
validate:
  # label used for this task instead of its plain name; it contains the project
  label: "{{.TASK}}-{{.PROJECT}}"
  dir: ./{{.PROJECT}}/validate
  # input and output files declared for Task's up-to-date check
  sources:
    - ../split/*.xml
  generates:
    - validate.log
  cmds:
    # refuse to run without a project name
    - test -n "{{.PROJECT}}"
    # validation against the METS schema: fetch the schema unless a local
    # copy exists (-nc), then write all xmllint messages to validate.log
    - wget -q -nc https://www.loc.gov/standards/mets/mets.xsd
    - >-
      xmllint --schema mets.xsd --noout ../split/*.xml > validate.log 2>&1
|
2021-01-19 12:30:46 +01:00
|
|
|
|
|
|
|
zip:
  # label used for this task instead of its plain name; it contains the project
  label: "{{.TASK}}-{{.PROJECT}}"
  dir: ./{{.PROJECT}}/zip
  # input and output files declared for Task's up-to-date check
  sources:
    - ../split/*.xml
  generates:
    - "{{.PROJECT}}_{{.DATE}}.zip"
  cmds:
    # refuse to run without a project name
    - test -n "{{.PROJECT}}"
    # create a ZIP archive with a time stamp in its name; -j stores the files
    # without directory paths, -FS syncs an existing archive with the files
    - >-
      zip -q -FS -j {{.PROJECT}}_{{.DATE}}.zip ../split/*.xml
|
2021-01-19 12:30:46 +01:00
|
|
|
|
|
|
|
diff:
  # label used for this task instead of its plain name; it contains the project
  label: "{{.TASK}}-{{.PROJECT}}"
  dir: ./{{.PROJECT}}
  # input and output files declared for Task's up-to-date check
  sources:
    - split/*.xml
  generates:
    - diff.log
  cmds:
    # refuse to run without a project name
    - test -n "{{.PROJECT}}"
    # compare the content of the two most recent ZIP archives: if a second
    # newest archive exists, unpack it to "old" and the newest one to "new"
    - |
      if test -n "$(ls -t zip/*.zip | sed -n 2p)"; then
        unzip -q -d old $(ls -t zip/*.zip | sed -n 2p)
        unzip -q -d new $(ls -t zip/*.zip | sed -n 1p)
      fi
    # a non-zero exit status of diff must not fail the task
    - diff -d old new > diff.log || exit 0
    - rm -rf old new
    # check that the diff has no more than 500 lines
    - |
      if (( 500 < $(wc -l <diff.log) )); then
        echo 1>&2 "Unerwartet große Änderungen in $PWD/diff.log!" && exit 1
      fi
    # archive the diff next to the ZIP archives
    - cp diff.log zip/{{.PROJECT}}_{{.DATE}}.diff
|
2021-01-19 12:30:46 +01:00
|
|
|
|
|
|
|
linkcheck:
  # label used for this task instead of its plain name; it contains the project
  label: '{{.TASK}}-{{.PROJECT}}'
  dir: ./{{.PROJECT}}
  cmds:
    # refuse to run without a project name
    - test -n "{{.PROJECT}}"
    # extract links into links.txt as "<file><TAB><url>", unique by URL.
    # -H forces the "<file>:" prefix even when only one XML file matches the
    # glob; without it the sed below finds no ":href=" and the URL column
    # read by awk stays empty.
    - grep -H -o 'href="[^"]*"' split/*.xml | sed 's/:href=/\t/' | tr -d '"' | sort -k 2 --unique > links.txt
    # determine the http status code: build a curl config with one url/output
    # pair per link and request only the headers, following redirects
    - awk '{ print "url = " $2 "\noutput = /dev/null"; }' links.txt > curl.cfg
    - curl --silent --head --location --write-out "%{http_code}\t%{url_effective}\n" --config curl.cfg > curl.log
    # build a table with status code, effective URL, file name and start URL
    - paste curl.log links.txt > linkcheck.log
    - rm -rf curl.cfg curl.log links.txt
    # check the log file for status codes != 2XX (lines not starting with 2)
    - if grep '^[^2]' linkcheck.log; then echo 1>&2 "Logdatei $PWD/linkcheck.log enthält problematische status codes!" && exit 1; fi
  # input and output files declared for Task's up-to-date check
  sources:
    - split/*.xml
  generates:
    - linkcheck.log
|
2021-01-19 12:30:46 +01:00
|
|
|
|
|
|
|
delete:
  # label used for this task instead of its plain name; it contains the project
  label: "{{.TASK}}-{{.PROJECT}}"
  dir: ./{{.PROJECT}}
  cmds:
    # refuse to run without a project name
    - test -n "{{.PROJECT}}"
    # remove the working directories of the project
    - rm -rf harvest refine split validate
    # remove the diff log
    - rm -f diff.log
|