# openrefine-task-runner/example-powerhouse/Taskfile.yml
# 80 lines, 2.7 KiB, YAML
# Web-view timestamp: 2021-02-20 00:22:12 +01:00

version: '3'

# OpenRefine example project (Powerhouse Museum tutorial data).
#
# PROJECT is derived from the task name: '{{splitList ":" .TASK | first}}'
# yields the text before the first ":" (the namespace this file is included
# under). Tasks referenced with a leading ":" (:start, :stop, :check) are not
# defined in this file -- presumably they live in the parent directory's
# Taskfile.yml that includes this project; confirm there.

tasks:
  # Entry point: run the OpenRefine workflow, then check its log.
  main:
    desc: Powerhouse Museum Tutorial
    cmds:
      - task: refine
      # check OpenRefine log for any warnings and exit on error
      - task: :check
        vars:
          PROJECT: '{{splitList ":" .TASK | first}}'

  # Full workflow: start OpenRefine, import, transform, export, report, stop.
  refine:
    vars:
      PORT: 3336  # assign a different port for each project
      RAM: 2048M  # maximum RAM for OpenRefine java heap space
      PROJECT: '{{splitList ":" .TASK | first}}'
    deps:  # will be executed each run independent of up-to-date check
      - task: download
    cmds:  # tasks prepended with ":" are defined in Taskfile.yml
      - task: :start
        vars:
          PROJECT: '{{.PROJECT}}'
          PORT: '{{.PORT}}'
          RAM: '{{.RAM}}'
      - task: import
        vars:
          PROJECT: '{{.PROJECT}}'
          PORT: '{{.PORT}}'
      - task: apply
        vars:
          PROJECT: '{{.PROJECT}}'
          PORT: '{{.PORT}}'
      - task: export
        vars:
          PROJECT: '{{.PROJECT}}'
          PORT: '{{.PORT}}'
      - task: stats
        vars:
          PROJECT: '{{.PROJECT}}'
          PORT: '{{.PORT}}'
      - task: :stop
        vars:
          PROJECT: '{{.PROJECT}}'
          PORT: '{{.PORT}}'
    # Inputs and outputs used for the up-to-date check.
    sources:
      - input/**
      - config/**
    generates:
      - output/openrefine.log
      - 'output/{{.PROJECT}}.openrefine.tar.gz'
    # workaround to avoid an orphaned Java process on error
    # https://github.com/go-task/task/issues/141
    ignore_error: true

  # Fetch the tutorial data set and the transformation rules.
  download:
    cmds:
      - mkdir -p input config
      - wget --no-verbose -O input/phm-collection.tsv https://github.com/opencultureconsulting/openrefine-batch/raw/master/examples/powerhouse-museum/input/phm-collection.tsv
      - wget --no-verbose -O config/phm-transform.json https://github.com/opencultureconsulting/openrefine-batch/raw/master/examples/powerhouse-museum/config/phm-transform.json

  # Create the OpenRefine project from the downloaded TSV (runs in input/).
  import:
    dir: input
    cmds:
      # import file
      - |
        ../../openrefine/client -P {{.PORT}} \
          --create phm-collection.tsv \
          --processQuotes false \
          --guessCellValueTypes true \
          --projectName {{.PROJECT}}
    ignore_error: true  # workaround

  # Apply the downloaded transformation rules (runs in config/).
  apply:
    dir: config
    cmds:
      # apply transformation rules
      - |
        ../../openrefine/client -P {{.PORT}} {{.PROJECT}} \
          --apply phm-transform.json
    ignore_error: true  # workaround

  # Export the project to phm-results.tsv (runs in output/).
  export:
    dir: output
    cmds:
      # export to file; use readlink to log full path to output file
      - |
        ../../openrefine/client -P {{.PORT}} {{.PROJECT}} \
          --output "$(readlink -m phm-results.tsv)"
    ignore_error: true  # workaround

  # Report resource usage of the process listening on PORT.
  stats:
    cmds:
      # print allocated system resources
      - ps -o start,etime,%mem,%cpu,rss -p $(lsof -t -i:{{.PORT}})
    ignore_error: true  # workaround

  # enable standalone execution (running `task` in project directory):
  # re-invokes task from the parent directory as "<directory name>:main"
  default:
    cmds:
      - PROJECT="${PWD##*/}:main" && cd .. && task "$PROJECT"