2021-02-20 00:22:12 +01:00
|
|
|
version: '3'
|
|
|
|
|
|
|
|
tasks:
|
|
|
|
# Entry point of this project: runs the refine workflow, then hands the
# project name to the check task for log inspection.
main:
  desc: Library Carpentry Lesson covering DOAJ
  cmds:
    - task: refine
    # check OpenRefine log for any warnings and exit on error
    # (the leading ":" addresses a task defined outside this file)
    - task: ':check'
      vars:
        # project name = part of the task name before the first ":"
        PROJECT: '{{splitList ":" .TASK | first}}'
|
2021-02-20 00:22:12 +01:00
|
|
|
|
|
|
|
# OpenRefine workflow for this project: start the server, import the CSV,
# apply the transformation rules, export the result as TSV, report the
# resources used, then stop the server.
refine:
  dir: './{{.PROJECT}}'
  vars:
    # assign a different port for each project
    PORT: 3335
    # maximum RAM for OpenRefine java heap space
    RAM: '2048M'
    # project name = part of the task name before the first ":"
    PROJECT: '{{splitList ":" .TASK | first}}'
  # deps will be executed each run, independent of the up-to-date check
  deps:
    - task: download
  sources:
    - 'input/**'
    - 'config/**'
  generates:
    - 'openrefine.log'
    - './{{.PROJECT}}.openrefine.tar.gz'
    - 'output/**'
  # workaround to avoid an orphaned Java process on error
  # https://github.com/go-task/task/issues/141
  ignore_error: true
  cmds:
    # launch OpenRefine
    - task: ':start'
      vars:
        PROJECT: '{{.PROJECT}}'
        PORT: '{{.PORT}}'
        RAM: '{{.RAM}}'
    # import file
    # (folded scalar: the lines below are joined with single spaces into
    # one command; stdout and stderr are also appended to openrefine.log)
    - >
      ../openrefine/client
      -P {{.PORT}}
      --create "$(readlink -m input/doaj-article-sample.csv)"
      --projectName {{.PROJECT}}
      > >(tee -a openrefine.log)
      2>&1
    # apply transformation rules
    - >
      ../openrefine/client
      -P {{.PORT}}
      {{.PROJECT}}
      --apply config/doaj-openrefine.json
      > >(tee -a openrefine.log)
      2>&1
    # export to file
    - >
      mkdir -p output &&
      ../openrefine/client
      -P {{.PORT}}
      {{.PROJECT}}
      --output "$(readlink -m output/doaj-results.tsv)"
      > >(tee -a openrefine.log)
      2>&1
    # print allocated system resources
    # (literal scalar: newlines are kept, so this runs as a small script;
    # PID is whatever lsof reports for the configured port)
    - |
      PID="$(lsof -t -i:{{.PORT}})"
      echo "used $(($(ps --no-headers -o rss -p "$PID") / 1024)) MB RAM" \
        > >(tee -a openrefine.log)
      echo "used $(ps --no-headers -o cputime -p "$PID") CPU time" \
        > >(tee -a openrefine.log)
    # shut down OpenRefine and archive the OpenRefine project
    - task: ':stop'
      vars:
        PROJECT: '{{.PROJECT}}'
        PORT: '{{.PORT}}'
|
2021-02-20 00:22:12 +01:00
|
|
|
|
|
|
|
# Fetches the files the refine task works on: the sample CSV goes to
# input/, the OpenRefine rules (JSON) go to config/.
download:
  # working directory = part of the task name before the first ":"
  dir: '{{splitList ":" .TASK | first}}'
  cmds:
    - mkdir -p input config
    # Download input
    # (folded scalar: the lines are joined with single spaces)
    - >
      wget
      --no-verbose
      -O input/doaj-article-sample.csv
      https://github.com/felixlohmeier/openrefine-kimws2019/raw/master/doaj-article-sample.csv
    # Download config
    - >
      wget
      --no-verbose
      -O config/doaj-openrefine.json
      https://github.com/felixlohmeier/openrefine-kimws2019/raw/master/doaj-openrefine.json
|
2021-02-20 00:22:12 +01:00
|
|
|
|
|
|
|
# enable standalone execution (running `task` in project directory)
default:
  cmds:
    # Build "<current directory name>:main", change to the parent
    # directory and run that task from there.
    # (">-" folds the lines into one command without a trailing newline)
    - >-
      PROJECT="${PWD##*/}:main" &&
      cd .. &&
      task "$PROJECT"
|