# https://github.com/opencultureconsulting/openrefine-task-runner
#
# Root Taskfile: installs OpenRefine plus openrefine-client into .openrefine
# and provides helper tasks (start, stop, kill, check) that use the template
# variables DIR, PROJECT, PORT and RAM. Those variables are not defined in this
# file; presumably the included project Taskfiles pass them in (verify there).

version: '3'

includes:
  example-doaj: example-doaj
  example-duplicates: example-duplicates
  example-powerhouse: example-powerhouse
  # add the directory name of your project here

silent: true
output: prefixed

env:
  # absolute paths, so the binaries can be called from any task working directory
  OPENREFINE:
    sh: readlink -m .openrefine/refine
  CLIENT:
    sh: readlink -m .openrefine/client

tasks:
  default:
    desc: execute all projects in parallel
    deps:
      - task: example-doaj:refine
      - task: example-duplicates:refine
      - task: example-powerhouse:refine
      # add the directory name of your project here
    cmds:
      - task: check

  install:
    desc: (re)install OpenRefine and openrefine-client into subdirectory .openrefine
    cmds:
      - | # delete existing install and recreate folder
        rm -rf .openrefine
        mkdir -p .openrefine
      - > # download OpenRefine archive
        wget --no-verbose -O openrefine.tar.gz
        https://github.com/OpenRefine/OpenRefine/releases/download/3.5.0/openrefine-linux-3.5.0.tar.gz
      - | # install OpenRefine into subdirectory .openrefine
        tar -xzf openrefine.tar.gz -C .openrefine --strip-components=1
        rm openrefine.tar.gz
      - | # optimize OpenRefine for batch processing
        # fix path issue in OpenRefine startup file
        sed -i 's/cd `dirname $0`/cd "$(dirname "$0")"/' ".openrefine/refine"
        # do not try to open OpenRefine in browser
        sed -i '$ a JAVA_OPTIONS=-Drefine.headless=true' ".openrefine/refine.ini"
        # raise autosave period to 1440 minutes (24 hours)
        sed -i 's/#REFINE_AUTOSAVE_PERIOD=60/REFINE_AUTOSAVE_PERIOD=1440/' ".openrefine/refine.ini"
      - > # download openrefine-client into subdirectory .openrefine
        wget --no-verbose -O .openrefine/client
        https://github.com/opencultureconsulting/openrefine-client/releases/download/v0.3.10/openrefine-client_0-3-10_linux
      - chmod +x .openrefine/client # make client executable

  # Helper: launch OpenRefine for one project.
  # Uses DIR (working/data directory), PROJECT (log file name), PORT and RAM.
  start:
    dir: ./{{.DIR}}
    cmds:
      - | # verify that OpenRefine is installed
        if [ ! -f "$OPENREFINE" ]; then
          echo 1>&2 "OpenRefine missing; try task install"; exit 1
        fi
      - | # delete temporary files and log file of previous run
        rm -rf ./*.project* workspace.json
        rm -rf "{{.PROJECT}}.log"
      - > # launch OpenRefine with specific data directory and redirect its output to a log file
        "$OPENREFINE" -v warn -p {{.PORT}} -m {{.RAM}}
        -d ../{{.DIR}}
        >> "{{.PROJECT}}.log" 2>&1 &
      - > # wait until OpenRefine API is available (timeout aborts after 30 seconds)
        timeout 30s bash -c "until
        wget -q -O - -o /dev/null http://localhost:{{.PORT}} | cat | grep -q -o OpenRefine;
        do sleep 1;
        done"

  # Helper: stop OpenRefine and keep the project as an archive.
  # Uses DIR, PROJECT and PORT.
  stop:
    dir: ./{{.DIR}}
    cmds:
      - | # shut down OpenRefine gracefully
        PID=$(lsof -t -i:{{.PORT}})
        if [ -n "$PID" ]; then
          kill $PID
          # block until the process has really exited
          while ps -p $PID > /dev/null; do sleep 1; done
        else
          echo 1>&2 "no process found on port {{.PORT}}; nothing to shut down"
        fi
      - > # archive the OpenRefine project (the *.project directory whose metadata.json mentions PROJECT)
        tar cfz
        "{{.PROJECT}}.openrefine.tar.gz"
        -C $(grep -l "{{.PROJECT}}" *.project/metadata.json | cut -d '/' -f 1)
        .
      - rm -rf ./*.project* workspace.json # delete temporary files

  # Helper: stop OpenRefine without archiving the project.
  # Uses DIR and PORT.
  kill:
    dir: ./{{.DIR}}
    cmds:
      - | # shut down OpenRefine immediately to save time and disk space
        PID=$(lsof -t -i:{{.PORT}})
        if [ -n "$PID" ]; then
          kill -9 $PID
          # block until the process has really exited
          while ps -p $PID > /dev/null; do sleep 1; done
        else
          echo 1>&2 "no process found on port {{.PORT}}; nothing to kill"
        fi
      - rm -rf ./*.project* workspace.json # delete temporary files

  check:
    desc: check OpenRefine log for any warnings and exit on error
    dir: ./{{.DIR}}
    cmds:
      - | # search all *.log files below the working directory for "exception" or "error"
        # grep -r with --include never falls back to reading stdin, so this
        # cannot hang when no log file exists, and file names are not word-split
        if grep -r -i --include='*.log' 'exception\|error' .; then
          echo 1>&2 "log contains warnings!"; exit 1
        fi