🎨 restructure taskfiles

This commit is contained in:
Felix Lohmeier 2021-02-23 17:11:59 +01:00
parent 62eb0cddbf
commit 6789554c60
6 changed files with 127 additions and 175 deletions

2
.gitignore vendored
View File

@ -1,6 +1,8 @@
.task
openrefine
*/output
*/openrefine.log
*/*.openrefine.tar.gz
example-doaj/input
example-doaj/config
example-powerhouse/input

View File

@ -139,4 +139,4 @@ Please file an [issue](https://github.com/opencultureconsulting/openrefine-task-
- [ ] how-to for extracting input options from OpenRefine GUI (via metadata in open project)
- [ ] document known issues, e.g. [import xls, xlsx, ods](https://github.com/opencultureconsulting/openrefine-client/issues/4)
- [ ] add Binder files and badge
- [ ] add example notebooks (links to nbviewer and Binder)
- [ ] add example notebooks (links to nbviewer and Binder)

View File

@ -3,15 +3,9 @@
version: '3'
includes:
example-doaj:
taskfile: example-doaj
dir: example-doaj
example-duplicates:
taskfile: example-duplicates
dir: example-duplicates
example-powerhouse:
taskfile: example-powerhouse
dir: example-powerhouse
example-doaj: example-doaj
example-duplicates: example-duplicates
example-powerhouse: example-powerhouse
# add your project here
silent: true
@ -32,29 +26,35 @@ tasks:
desc: (re)install OpenRefine and openrefine-client into subdirectory openrefine
cmds:
- | # delete existing install and recreate folder
rm -rf openrefine; mkdir -p openrefine
- | # install OpenRefine into subdirectory openrefine
wget --no-verbose -O openrefine.tar.gz https://github.com/OpenRefine/OpenRefine/releases/download/3.4.1/openrefine-linux-3.4.1.tar.gz
tar -xzf openrefine.tar.gz -C openrefine --strip 1 && rm openrefine.tar.gz
- sed -i 's/cd `dirname $0`/cd "$(dirname "$0")"/' "openrefine/refine" # fix path issue in OpenRefine startup file
- sed -i '$ a JAVA_OPTIONS=-Drefine.headless=true' "openrefine/refine.ini" # do not try to open OpenRefine in browser
- sed -i 's/#REFINE_AUTOSAVE_PERIOD=60/REFINE_AUTOSAVE_PERIOD=1440/' "openrefine/refine.ini" # set autosave period from 5 minutes to 24 hours (1440 minutes)
- | # install openrefine-client into subdirectory openrefine
wget --no-verbose -O openrefine/client https://github.com/opencultureconsulting/openrefine-client/releases/download/v0.3.10/openrefine-client_0-3-10_linux
chmod +x openrefine/client
rm -rf openrefine
mkdir -p openrefine
- > # download OpenRefine archive
wget --no-verbose -O openrefine.tar.gz
https://github.com/OpenRefine/OpenRefine/releases/download/3.4.1/openrefine-linux-3.4.1.tar.gz
- > # install OpenRefine into subdirectory openrefine
tar -xzf openrefine.tar.gz -C openrefine --strip 1
&& rm openrefine.tar.gz
- | # optimize OpenRefine for batch processing
sed -i 's/cd `dirname $0`/cd "$(dirname "$0")"/' "openrefine/refine" # fix path issue in OpenRefine startup file
sed -i '$ a JAVA_OPTIONS=-Drefine.headless=true' "openrefine/refine.ini" # do not try to open OpenRefine in browser
sed -i 's/#REFINE_AUTOSAVE_PERIOD=60/REFINE_AUTOSAVE_PERIOD=1440/' "openrefine/refine.ini" # set autosave period from 5 minutes to 24 hours (1440 minutes)
- > # download openrefine-client into subdirectory openrefine
wget --no-verbose -O openrefine/client
https://github.com/opencultureconsulting/openrefine-client/releases/download/v0.3.10/openrefine-client_0-3-10_linux
&& chmod +x openrefine/client
start:
dir: ./{{.PROJECT}}/output
dir: ./{{.PROJECT}}
cmds:
- | # check install and delete any temporary OpenRefine files
if [ ! -f "../../openrefine/refine" ]; then
if [ ! -f "../openrefine/refine" ]; then
echo 1>&2 "OpenRefine missing; try task install"; exit 1
fi
rm -rf ./*.project* workspace.json
- | # launch OpenRefine with specific data directory and redirect its output to a log file
../../openrefine/refine -v warn -p {{.PORT}} -m {{.RAM}} \
-d ../{{.PROJECT}}/output \
> openrefine.log 2>&1 &
- > # launch OpenRefine with specific data directory and redirect its output to a log file
../openrefine/refine -v warn -p {{.PORT}} -m {{.RAM}}
-d ../{{.PROJECT}}
> openrefine.log 2>&1 &
- | # wait until OpenRefine API is available
timeout 30s bash -c "until
wget -q -O - http://localhost:{{.PORT}} | cat | grep -q -o OpenRefine
@ -62,17 +62,18 @@ tasks:
done"
stop:
dir: ./{{.PROJECT}}/output
dir: ./{{.PROJECT}}
cmds:
- | # shut down OpenRefine
PID=$(lsof -t -i:{{.PORT}})
kill $PID
while ps -p $PID > /dev/null; do sleep 1; done
- | # archive the OpenRefine project
tar cfz \
{{.PROJECT}}.openrefine.tar.gz \
-C $(grep -l {{.PROJECT}} *.project/metadata.json | cut -d '/' -f 1) \
.
- > # archive the OpenRefine project and delete temporary files
tar cfz
{{.PROJECT}}.openrefine.tar.gz
-C $(grep -l {{.PROJECT}} *.project/metadata.json | cut -d '/' -f 1)
.
&& rm -rf ./*.project* workspace.json
check:
desc: check OpenRefine log for any warnings and exit on error

View File

@ -6,71 +6,56 @@ tasks:
cmds:
- task: refine
- task: :check # check OpenRefine log for any warnings and exit on error
vars: {PROJECT: '{{splitList ":" .TASK | first}}'}
vars: {PROJECT: '{{splitList ":" .TASK | first}}'}
refine:
dir: ./{{.PROJECT}}
vars:
PORT: 3335 # assign a different port for each project
RAM: 2048M # maximum RAM for OpenRefine java heap space
PROJECT: '{{splitList ":" .TASK | first}}'
deps: # will be executed each run independent of up-to-date check
- task: download
cmds: # tasks prepended with ":" are defined in Taskfile.yml
- task: :start
cmds:
- task: :start # launch OpenRefine
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}', RAM: '{{.RAM}}'}
- task: import
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
- task: apply
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
- task: export
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
- task: stats
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
- task: :stop
- > # import file
../openrefine/client -P {{.PORT}}
--create "$(readlink -m input/doaj-article-sample.csv)"
--projectName {{.PROJECT}}
- > # apply transformation rules
../openrefine/client -P {{.PORT}} {{.PROJECT}}
--apply config/doaj-openrefine.json
- > # export to file
mkdir -p output &&
../openrefine/client -P {{.PORT}} {{.PROJECT}}
--output "$(readlink -m output/doaj-results.tsv)"
- | # print allocated system resources
PID="$(lsof -t -i:{{.PORT}})"
echo "used $(($(ps --no-headers -o rss -p "$PID") / 1024)) MB RAM"
echo "used $(ps --no-headers -o cputime -p "$PID") CPU time"
- task: :stop # shut down OpenRefine and archive the OpenRefine project
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
sources:
- input/**
- config/**
generates:
- output/openrefine.log
- output/{{.PROJECT}}.openrefine.tar.gz
ignore_error: true # workaround to avoid an orphaned Java process on error https://github.com/go-task/task/issues/141
- openrefine.log
- ./{{.PROJECT}}.openrefine.tar.gz
- output/**
ignore_error: true # workaround to avoid an orphaned Java process on error
# https://github.com/go-task/task/issues/141
download:
dir: '{{splitList ":" .TASK | first}}'
cmds:
- mkdir -p input config
- wget --no-verbose -O input/doaj-article-sample.csv https://github.com/felixlohmeier/openrefine-kimws2019/raw/master/doaj-article-sample.csv
- wget --no-verbose -O config/doaj-openrefine.json https://github.com/felixlohmeier/openrefine-kimws2019/raw/master/doaj-openrefine.json
import:
dir: input
cmds:
- | # import file
../../openrefine/client -P {{.PORT}} \
--create doaj-article-sample.csv \
--projectName {{.PROJECT}}
ignore_error: true # workaround
apply:
dir: config
cmds:
- | # apply transformation rules
../../openrefine/client -P {{.PORT}} {{.PROJECT}} \
--apply doaj-openrefine.json
ignore_error: true # workaround
export:
dir: output
cmds:
- | # export to file; use readlink to log full path to output file
../../openrefine/client -P {{.PORT}} {{.PROJECT}} \
--output "$(readlink -m doaj-results.tsv)"
ignore_error: true # workaround
stats:
cmds:
- ps -o start,etime,%mem,%cpu,rss -p $(lsof -t -i:{{.PORT}}) # print allocated system resources
ignore_error: true # workaround
- > # Download input
wget --no-verbose -O input/doaj-article-sample.csv
https://github.com/felixlohmeier/openrefine-kimws2019/raw/master/doaj-article-sample.csv
- > # Download config
wget --no-verbose -O config/doaj-openrefine.json
https://github.com/felixlohmeier/openrefine-kimws2019/raw/master/doaj-openrefine.json
default: # enable standalone execution (running `task` in project directory)
cmds:

View File

@ -6,64 +6,44 @@ tasks:
cmds:
- task: refine
- task: :check # check OpenRefine log for any warnings and exit on error
vars: {PROJECT: '{{splitList ":" .TASK | first}}'}
vars: {PROJECT: '{{splitList ":" .TASK | first}}'}
refine:
dir: ./{{.PROJECT}}
vars:
PORT: 3334 # assign a different port for each project
RAM: 2048M # maximum RAM for OpenRefine java heap space
PROJECT: '{{splitList ":" .TASK | first}}'
cmds: # tasks prepended with ":" are defined in Taskfile.yml
- task: :start
cmds:
- task: :start # launch OpenRefine
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}', RAM: '{{.RAM}}'}
- task: import
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
- task: apply
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
- task: export
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
- task: stats
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
- task: :stop
- > # import file
../openrefine/client -P {{.PORT}}
--create "$(readlink -m input/duplicates.csv)"
--encoding UTF-8
--projectName {{.PROJECT}}
- > # apply transformation rules
../openrefine/client -P {{.PORT}} {{.PROJECT}}
--apply config/duplicates-deletion.json
- > # export to file
mkdir -p output &&
../openrefine/client -P {{.PORT}} {{.PROJECT}}
--output "$(readlink -m output/deduped.xls)"
- | # print allocated system resources
PID="$(lsof -t -i:{{.PORT}})"
echo "used $(($(ps --no-headers -o rss -p "$PID") / 1024)) MB RAM"
echo "used $(ps --no-headers -o cputime -p "$PID") CPU time"
- task: :stop # shut down OpenRefine and archive the OpenRefine project
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
sources:
- input/**
- config/**
generates:
- output/openrefine.log
- output/{{.PROJECT}}.openrefine.tar.gz
ignore_error: true # workaround to avoid an orphaned Java process on error https://github.com/go-task/task/issues/141
import:
dir: input
cmds:
- | # import file
../../openrefine/client -P {{.PORT}} \
--create duplicates.csv \
--encoding UTF-8 \
--projectName {{.PROJECT}}
ignore_error: true # workaround
apply:
dir: config
cmds:
- | # apply transformation rules
../../openrefine/client -P {{.PORT}} {{.PROJECT}} \
--apply duplicates-deletion.json
ignore_error: true # workaround
export:
dir: output
cmds:
- | # export to file; use readlink to log full path to output file
../../openrefine/client -P {{.PORT}} {{.PROJECT}} \
--output "$(readlink -m deduped.xls)"
ignore_error: true # workaround
stats:
cmds:
- ps -o start,etime,%mem,%cpu,rss -p $(lsof -t -i:{{.PORT}}) # print allocated system resources
ignore_error: true # workaround
- openrefine.log
- ./{{.PROJECT}}.openrefine.tar.gz
- output/**
ignore_error: true # workaround to avoid an orphaned Java process on error
# https://github.com/go-task/task/issues/141
default: # enable standalone execution (running `task` in project directory)
cmds:

View File

@ -6,73 +6,57 @@ tasks:
cmds:
- task: refine
- task: :check # check OpenRefine log for any warnings and exit on error
vars: {PROJECT: '{{splitList ":" .TASK | first}}'}
vars: {PROJECT: '{{splitList ":" .TASK | first}}'}
refine:
dir: ./{{.PROJECT}}
vars:
PORT: 3336 # assign a different port for each project
RAM: 2048M # maximum RAM for OpenRefine java heap space
PROJECT: '{{splitList ":" .TASK | first}}'
deps: # will be executed each run independent of up-to-date check
- task: download
cmds: # tasks prepended with ":" are defined in Taskfile.yml
- task: :start
cmds:
- task: :start # launch OpenRefine
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}', RAM: '{{.RAM}}'}
- task: import
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
- task: apply
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
- task: export
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
- task: stats
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
- task: :stop
- > # import file
../openrefine/client -P {{.PORT}}
--create "$(readlink -m input/phm-collection.tsv)"
--processQuotes false
--guessCellValueTypes true
--projectName {{.PROJECT}}
- > # apply transformation rules
../openrefine/client -P {{.PORT}} {{.PROJECT}}
--apply config/phm-transform.json
- > # export to file
mkdir -p output &&
../openrefine/client -P {{.PORT}} {{.PROJECT}}
--output "$(readlink -m output/phm-results.tsv)"
- | # print allocated system resources
PID="$(lsof -t -i:{{.PORT}})"
echo "used $(($(ps --no-headers -o rss -p "$PID") / 1024)) MB RAM"
echo "used $(ps --no-headers -o cputime -p "$PID") CPU time"
- task: :stop # shut down OpenRefine and archive the OpenRefine project
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
sources:
- input/**
- config/**
generates:
- output/openrefine.log
- output/{{.PROJECT}}.openrefine.tar.gz
ignore_error: true # workaround to avoid an orphaned Java process on error https://github.com/go-task/task/issues/141
- openrefine.log
- ./{{.PROJECT}}.openrefine.tar.gz
- output/**
ignore_error: true # workaround to avoid an orphaned Java process on error
# https://github.com/go-task/task/issues/141
download:
dir: '{{splitList ":" .TASK | first}}'
cmds:
- mkdir -p input config
- wget --no-verbose -O input/phm-collection.tsv https://github.com/opencultureconsulting/openrefine-batch/raw/master/examples/powerhouse-museum/input/phm-collection.tsv
- wget --no-verbose -O config/phm-transform.json https://github.com/opencultureconsulting/openrefine-batch/raw/master/examples/powerhouse-museum/config/phm-transform.json
import:
dir: input
cmds:
- | # import file
../../openrefine/client -P {{.PORT}} \
--create phm-collection.tsv \
--processQuotes false \
--guessCellValueTypes true \
--projectName {{.PROJECT}}
ignore_error: true # workaround
apply:
dir: config
cmds:
- | # apply transformation rules
../../openrefine/client -P {{.PORT}} {{.PROJECT}} \
--apply phm-transform.json
ignore_error: true # workaround
export:
dir: output
cmds:
- | # export to file; use readlink to log full path to output file
../../openrefine/client -P {{.PORT}} {{.PROJECT}} \
--output "$(readlink -m phm-results.tsv)"
ignore_error: true # workaround
stats:
cmds:
- ps -o start,etime,%mem,%cpu,rss -p $(lsof -t -i:{{.PORT}}) # print allocated system resources
ignore_error: true # workaround
- > # Download input
wget --no-verbose -O input/phm-collection.tsv
https://github.com/opencultureconsulting/openrefine-batch/raw/master/examples/powerhouse-museum/input/phm-collection.tsv
- > # Download config
wget --no-verbose -O config/phm-transform.json
https://github.com/opencultureconsulting/openrefine-batch/raw/master/examples/powerhouse-museum/config/phm-transform.json
default: # enable standalone execution (running `task` in project directory)
cmds: