2021-02-23 22:45:03 +01:00
|
|
|
# https://github.com/opencultureconsulting/openrefine-task-runner
|
2021-02-20 00:22:12 +01:00
|
|
|
|
|
|
|
# Taskfile schema version
version: "3"

# Each entry makes the Taskfile of a project directory available under a
# namespace, e.g. the task "refine" of example-doaj as "example-doaj:refine".
includes:
  example-doaj: example-doaj
  example-duplicates: example-duplicates
  example-powerhouse: example-powerhouse
  # add the directory name of your project here

# do not echo the commands themselves, only their output
silent: true
# label every output line with the task that produced it
output: prefixed
|
|
|
|
|
2021-02-23 22:45:03 +01:00
|
|
|
# Environment variables computed once via shell and exported to all tasks.
# readlink -m yields an absolute path even before "task install" has created
# the files, so the values stay valid when a task runs in another directory.
env:
  # OpenRefine launcher script; the start task checks and executes "$OPENREFINE"
  OPENREFINE: {sh: "readlink -m .openrefine/refine"}
  # openrefine-client binary that the install task downloads to .openrefine/client
  CLIENT: {sh: "readlink -m .openrefine/client"}
|
2021-02-23 22:45:03 +01:00
|
|
|
|
2021-02-20 00:22:12 +01:00
|
|
|
tasks:
|
|
|
|
default:
  desc: execute all projects in parallel
  # one refine task per included project; Task runs deps concurrently
  deps:
    - {task: "example-doaj:refine"}
    - {task: "example-duplicates:refine"}
    - {task: "example-powerhouse:refine"}
    # add the directory name of your project here
  # once every dependency has finished, scan the log files
  cmds:
    - {task: check}
|
|
|
|
|
|
|
|
# Downloads OpenRefine 3.5.0 and openrefine-client 0.3.10 into ./.openrefine,
# replacing any previous install. Requires wget, tar and GNU sed.
# Multi-line scripts start with "set -e" because a script otherwise reports
# only the exit status of its last line and earlier failures go unnoticed.
install:
  desc: (re)install OpenRefine and openrefine-client into subdirectory .openrefine
  cmds:
    - | # delete existing install and recreate folder
      set -e
      rm -rf .openrefine
      mkdir -p .openrefine
    - > # download OpenRefine archive
      wget --no-verbose -O openrefine.tar.gz
      https://github.com/OpenRefine/OpenRefine/releases/download/3.5.0/openrefine-linux-3.5.0.tar.gz
    - | # install OpenRefine into subdirectory .openrefine
      set -e # stop before deleting the archive if extraction fails
      tar -xzf openrefine.tar.gz -C .openrefine --strip-components=1
      rm openrefine.tar.gz
    - | # optimize OpenRefine for batch processing
      set -e # a failed edit must fail the install instead of being skipped
      sed -i 's/cd `dirname $0`/cd "$(dirname "$0")"/' ".openrefine/refine" # fix path issue in OpenRefine startup file
      sed -i '$ a JAVA_OPTIONS=-Drefine.headless=true' ".openrefine/refine.ini" # do not try to open OpenRefine in browser
      sed -i 's/#REFINE_AUTOSAVE_PERIOD=60/REFINE_AUTOSAVE_PERIOD=1440/' ".openrefine/refine.ini" # set autosave period to 1440 minutes (24 hours)
    - > # download openrefine-client into subdirectory .openrefine
      wget --no-verbose -O .openrefine/client
      https://github.com/opencultureconsulting/openrefine-client/releases/download/v0.3.10/openrefine-client_0-3-10_linux
    - chmod +x .openrefine/client # make client executable
|
2021-02-20 00:22:12 +01:00
|
|
|
|
|
|
|
# Launches OpenRefine in the background for one project and blocks until its
# web interface answers. Uses the template variables DIR (project directory),
# PROJECT (base name of log file), PORT and RAM; they are not defined in this
# file and are presumably passed in by the calling task.
start:
  dir: ./{{.DIR}}
  cmds:
    - | # verify that OpenRefine is installed
      if [ ! -f "$OPENREFINE" ]; then
        echo 1>&2 "OpenRefine missing; try task install"; exit 1
      fi
    - | # delete temporary files and log file of previous run
      rm -rf ./*.project* workspace.json
      rm -rf "{{.PROJECT}}.log"
    - > # launch OpenRefine with specific data directory and redirect its output to a log file
      "$OPENREFINE" -v warn -p {{.PORT}} -m {{.RAM}}
      -d "../{{.DIR}}"
      >> "{{.PROJECT}}.log" 2>&1 &
    - | # wait until OpenRefine API is available
      # poll once per second, give up after 30 seconds and say why
      if ! timeout 30s bash -c "until
      wget -q -O - -o /dev/null http://localhost:{{.PORT}} | cat | grep -q -o OpenRefine
      do sleep 1
      done"; then
        echo 1>&2 "OpenRefine not reachable on port {{.PORT}} after 30s; see {{.PROJECT}}.log"; exit 1
      fi
|
|
|
|
|
|
|
|
# Terminates the OpenRefine instance listening on PORT, packs its project
# directory into "<PROJECT>.openrefine.tar.gz" and removes the temporary
# workspace files. Runs inside the project directory DIR.
stop:
  dir: ./{{.DIR}}
  cmds:
    - | # shut down OpenRefine gracefully
      # -sTCP:LISTEN selects only the server, not clients connected to the port
      PIDS=$(lsof -t -i:{{.PORT}} -sTCP:LISTEN)
      if [ -z "$PIDS" ]; then
        echo 1>&2 "no process listening on port {{.PORT}}; nothing to shut down"
      fi
      for PID in $PIDS; do
        kill "$PID"
        # OpenRefine writes its project files on shutdown, so wait for exit
        while ps -p "$PID" > /dev/null; do sleep 1; done
      done
    - | # archive the OpenRefine project
      # locate the workspace folder whose metadata mentions the project name
      PROJECT_DIR=$(grep -l "{{.PROJECT}}" *.project/metadata.json | cut -d '/' -f 1 | head -n 1)
      if [ -z "$PROJECT_DIR" ]; then
        echo 1>&2 "no OpenRefine project matching {{.PROJECT}} found; cannot archive"; exit 1
      fi
      tar cfz "{{.PROJECT}}.openrefine.tar.gz" -C "$PROJECT_DIR" .
    - rm -rf ./*.project* workspace.json # delete temporary files
|
2021-02-20 00:22:12 +01:00
|
|
|
|
2021-02-25 13:01:20 +01:00
|
|
|
# Force-stops the OpenRefine instance listening on PORT without archiving the
# project, then removes the temporary workspace files in project directory DIR.
kill:
  dir: ./{{.DIR}}
  cmds:
    - | # shut down OpenRefine immediately to save time and disk space
      # -sTCP:LISTEN selects only the server, not clients connected to the port;
      # when nothing is listening the loop is skipped and cleanup still runs
      PIDS=$(lsof -t -i:{{.PORT}} -sTCP:LISTEN)
      for PID in $PIDS; do
        kill -9 "$PID"
        while ps -p "$PID" > /dev/null; do sleep 1; done
      done
    - rm -rf ./*.project* workspace.json # delete temporary files
|
|
|
|
|
2021-02-20 00:22:12 +01:00
|
|
|
# Fails when any *.log file below the working directory contains "exception"
# or "error" (case-insensitive); matching lines are printed by grep.
check:
  desc: check OpenRefine log for any warnings and exit on error
  dir: ./{{.DIR}}
  cmds:
    - | # find log file(s) and check for "exception" or "error"
      # recursive grep instead of $(find ...): does not wait on stdin when no
      # log file exists and copes with spaces in paths
      if grep -r -i --include='*.log' 'exception\|error' .; then
        echo 1>&2 "log contains warnings!"; exit 1
      fi
|