openrefine-task-runner/example-powerhouse/Taskfile.yml

73 lines
2.8 KiB
YAML

version: '3'
tasks:
main:
desc: Powerhouse Museum Tutorial
vars:
DIR: '{{splitList ":" .TASK | first}}' # results in the task namespace, which is identical to the directory name
cmds:
- task: refine
- task: :check # check OpenRefine log for any warnings and exit on error
vars: {DIR: '{{.DIR}}'}
refine:
dir: ./{{.DIR}}
vars:
DIR: '{{splitList ":" .TASK | first}}'
PROJECT: phm
PORT: 3336 # assign a different port for each project
RAM: 2048M # maximum RAM for OpenRefine java heap space
LOG: '>(tee -a "{{.PROJECT}}.log") 2>&1' # be careful when making changes here, as the path to the log file should match the server log (see main task "start")
deps:
- task: download # will be executed each run independent of up-to-date check
cmds:
- task: :start # launch OpenRefine
vars: {DIR: '{{.DIR}}', PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}', RAM: '{{.RAM}}'}
- > # import file
"$CLIENT" -P {{.PORT}}
--create "$(readlink -m input/phm-collection.tsv)"
--processQuotes false
--guessCellValueTypes true
--projectName "{{.PROJECT}}"
> {{.LOG}}
- > # apply transformation rules
"$CLIENT" -P {{.PORT}} "{{.PROJECT}}"
--apply config/phm-transform.json
> {{.LOG}}
- mkdir -p output
- > # export to file
"$CLIENT" -P {{.PORT}} "{{.PROJECT}}"
--output "$(readlink -m output/phm-results.tsv)"
> {{.LOG}}
- | # print allocated system resources
PID="$(lsof -t -i:{{.PORT}})"
echo "used $(($(ps --no-headers -o rss -p "$PID") / 1024)) MB RAM" > {{.LOG}}
echo "used $(ps --no-headers -o cputime -p "$PID") CPU time" > {{.LOG}}
- task: :stop # shut down OpenRefine and archive the OpenRefine project
vars: {DIR: '{{.DIR}}', PORT: '{{.PORT}}', PROJECT: '{{.PROJECT}}'}
sources:
- Taskfile.yml
- input/**
- config/**
generates:
- ./{{.PROJECT}}.openrefine.tar.gz
- output/**
ignore_error: true # workaround to avoid an orphaned Java process on error https://github.com/go-task/task/issues/141
download:
dir: ./{{.DIR}}
vars:
DIR: '{{splitList ":" .TASK | first}}'
cmds:
- mkdir -p input config
- > # Download input
wget --no-verbose -O input/phm-collection.tsv
https://github.com/opencultureconsulting/openrefine-batch/raw/master/examples/powerhouse-museum/input/phm-collection.tsv
- > # Download config
wget --no-verbose -O config/phm-transform.json
https://github.com/opencultureconsulting/openrefine-batch/raw/master/examples/powerhouse-museum/config/phm-transform.json
default: # enable standalone execution (running `task` in project directory)
cmds:
- DIR="${PWD##*/}:main" && cd .. && task "$DIR"