# https://github.com/opencultureconsulting/openrefine-task-runner

version: '3'

includes:
  bielefeld: bielefeld
  muenster: muenster
  siegen: siegen
  wuppertal: wuppertal

silent: true
output: prefixed

vars:
  DATE: '{{ now | date "2006-01-02"}}'

env:
  OPENREFINE:
    sh: readlink -m .openrefine/refine
  CLIENT:
    sh: readlink -m .openrefine/client

tasks:
  default:
    desc: execute all projects in parallel
    deps:
      - task: bielefeld:main
      - task: muenster:main
      - task: siegen:main
      - task: wuppertal:main

  install:
    desc: (re)install OpenRefine and openrefine-client into subdirectory .openrefine
    cmds:
      - | # delete existing install and recreate folder
        rm -rf .openrefine
        mkdir -p .openrefine
      - > # download OpenRefine archive
        wget --no-verbose -O openrefine.tar.gz
        https://github.com/OpenRefine/OpenRefine/releases/download/3.4.1/openrefine-linux-3.4.1.tar.gz
      - | # install OpenRefine into subdirectory .openrefine
        tar -xzf openrefine.tar.gz -C .openrefine --strip 1
        rm openrefine.tar.gz
      - | # optimize OpenRefine for batch processing
        sed -i 's/cd `dirname $0`/cd "$(dirname "$0")"/' ".openrefine/refine" # fix path issue in OpenRefine startup file
        sed -i '$ a JAVA_OPTIONS=-Drefine.headless=true' ".openrefine/refine.ini" # do not try to open OpenRefine in browser
        sed -i 's/#REFINE_AUTOSAVE_PERIOD=60/REFINE_AUTOSAVE_PERIOD=1440/' ".openrefine/refine.ini" # raise autosave period from 5 minutes to 24 hours
      - > # download openrefine-client into subdirectory .openrefine
        wget --no-verbose -O .openrefine/client
        https://github.com/opencultureconsulting/openrefine-client/releases/download/v0.3.10/openrefine-client_0-3-10_linux
      - chmod +x .openrefine/client # make client executable

  start:
    dir: ./{{.PROJECT}}/refine
    cmds:
      - | # verify that OpenRefine is installed
        if [ ! -f "$OPENREFINE" ]; then
          echo 1>&2 "OpenRefine missing; try task install"; exit 1
        fi
      - | # delete temporary files and log file of previous run
        rm -rf ./*.project* workspace.json
        rm -rf "{{.PROJECT}}.log"
      - > # launch OpenRefine with this project's data directory (the working directory itself) and redirect its output to a log file
        "$OPENREFINE" -v warn -p {{.PORT}} -m {{.RAM}} -d ../refine
        >> "{{.PROJECT}}.log" 2>&1 &
      - | # wait until the OpenRefine API is available
        timeout 30s bash -c "until wget -q -O - http://localhost:{{.PORT}} | cat | grep -q -o OpenRefine
        do sleep 1
        done"

  stop:
    dir: ./{{.PROJECT}}/refine
    cmds:
      - | # shut down OpenRefine gracefully
        PID=$(lsof -t -i:{{.PORT}})
        kill $PID
        while ps -p $PID > /dev/null; do sleep 1; done
      - > # archive the OpenRefine project
        tar cfz "{{.PROJECT}}.openrefine.tar.gz"
        -C $(grep -l "{{.PROJECT}}" *.project/metadata.json | cut -d '/' -f 1) .
      - rm -rf ./*.project* workspace.json # delete temporary files

  kill:
    dir: ./{{.PROJECT}}/refine
    cmds:
      - | # shut down OpenRefine immediately to save time and disk space
        PID=$(lsof -t -i:{{.PORT}})
        kill -9 $PID
        while ps -p $PID > /dev/null; do sleep 1; done
      - rm -rf ./*.project* workspace.json # delete temporary files
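  # The generic tasks in this file expect the variables PROJECT, PORT, RAM and
  # MINIMUM to be supplied by the calling project Taskfiles in the bielefeld/,
  # muenster/, siegen/ and wuppertal/ directories. A minimal sketch of such an
  # include, with purely illustrative values (port, RAM and the task sequence
  # are assumptions, not part of this file):
  #
  #   # bielefeld/Taskfile.yml
  #   version: '3'
  #   tasks:
  #     main:
  #       cmds:
  #         - task: :start
  #           vars: {PROJECT: bielefeld, PORT: 3334, RAM: 2048M}
  #         - task: :stop
  #           vars: {PROJECT: bielefeld, PORT: 3334}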
  check:
    dir: ./{{.PROJECT}}/refine
    cmds:
      - | # find log file(s) and check for "exception" or "error"
        if grep -i 'exception\|error' $(find . -name '*.log'); then
          echo 1>&2 "log contains warnings!"; exit 1
        fi
      - | # verify that the minimum number of records was generated
        if (( {{.MINIMUM}} > $(grep -c recordIdentifier {{.PROJECT}}.txt) )); then
          echo 1>&2 "unexpectedly low number of records in $PWD/{{.PROJECT}}.txt!"; exit 1
        fi

  split:
    label: '{{.TASK}}-{{.PROJECT}}'
    dir: ./{{.PROJECT}}/split
    cmds:
      - test -n "{{.PROJECT}}"
      # split into individual files ('…' marks the record-delimiter regex,
      # which did not survive in the source)
      - csplit -s -z ../refine/{{.PROJECT}}.txt '/…/'

  validate:
    label: '{{.TASK}}-{{.PROJECT}}'
    dir: ./{{.PROJECT}}/validate
    cmds:
      - test -n "{{.PROJECT}}"
      # validate the split XML files ('…' marks the validation command, which
      # did not survive in the source; only its log redirection remains)
      - … > validate.log 2>&1
    sources:
      - ../split/*.xml
    generates:
      - validate.log

  zip:
    label: '{{.TASK}}-{{.PROJECT}}'
    dir: ./{{.PROJECT}}/zip
    cmds:
      - test -n "{{.PROJECT}}"
      # create a ZIP archive with timestamp
      - zip -q -FS -j {{.PROJECT}}_{{.DATE}}.zip ../split/*.xml
    sources:
      - ../split/*.xml
    generates:
      - '{{.PROJECT}}_{{.DATE}}.zip'

  diff:
    label: '{{.TASK}}-{{.PROJECT}}'
    dir: ./{{.PROJECT}}
    cmds:
      - test -n "{{.PROJECT}}"
      # compare the contents of the two most recent ZIP archives
      - if test -n "$(ls -t zip/*.zip | sed -n 2p)"; then unzip -q -d old $(ls -t zip/*.zip | sed -n 2p); unzip -q -d new $(ls -t zip/*.zip | sed -n 1p); fi
      - diff -d old new > diff.log || exit 0
      - rm -rf old new
      # fail if the diff contains more than 500 lines
      - if (( 500 < $(wc -l < diff.log) )); then echo 1>&2 "unexpectedly large changes in $PWD/diff.log!" && exit 1; fi
      # archive the diff
      - cp diff.log zip/{{.PROJECT}}_{{.DATE}}.diff
    sources:
      - split/*.xml
    generates:
      - diff.log

  linkcheck:
    label: '{{.TASK}}-{{.PROJECT}}'
    dir: ./{{.PROJECT}}
    cmds:
      - test -n "{{.PROJECT}}"
      # extract links
      - grep -o 'href="[^"]*"' split/*.xml | sed 's/:href=/\t/' | tr -d '"' | sort -k 2 --unique > links.txt
      # determine HTTP status codes
      - awk '{ print "url = " $2 "\noutput = /dev/null"; }' links.txt > curl.cfg
      - curl --silent --head --location --write-out "%{http_code}\t%{url_effective}\n" --config curl.cfg > curl.log
      # build a table with status code, effective URL, file name and start URL
      - paste curl.log links.txt > linkcheck.log
      - rm -rf curl.cfg curl.log links.txt
      # check the log file for status codes != 2XX
      - if grep '^[^2]' linkcheck.log; then echo 1>&2 "log file $PWD/linkcheck.log contains problematic status codes!" && exit 1; fi
    sources:
      - split/*.xml
    generates:
      - linkcheck.log

  delete:
    label: '{{.TASK}}-{{.PROJECT}}'
    dir: ./{{.PROJECT}}
    cmds:
      - test -n "{{.PROJECT}}"
      - rm -rf harvest
      - rm -rf refine
      - rm -rf split
      - rm -rf validate
      - rm -f diff.log
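
# Usage sketch, assuming the go-task runner (https://taskfile.dev) is installed
# as `task` and the command-line tools used above (wget, curl, zip, csplit,
# lsof) are available:
#
#   task install        # (re)install OpenRefine 3.4.1 and openrefine-client
#   task                # execute all four projects in parallel (default task)
#   task siegen:main    # execute a single project
#   task --list         # list all tasks that have a description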