From 498a261dd7e5520ec818de7b1b60c436eaac83d6 Mon Sep 17 00:00:00 2001 From: Felix Lohmeier Date: Tue, 23 Feb 2021 22:45:03 +0100 Subject: [PATCH] :truck: add flexibility by separating dir and project vars --- README.md | 1 + Taskfile.yml | 28 +++++++++++++++++----------- example-doaj/Taskfile.yml | 23 +++++++++++++---------- example-duplicates/Taskfile.yml | 20 +++++++++++--------- example-powerhouse/Taskfile.yml | 21 ++++++++++++--------- 5 files changed, 54 insertions(+), 39 deletions(-) diff --git a/README.md b/README.md index 6c827f1..a427fd4 100644 --- a/README.md +++ b/README.md @@ -133,6 +133,7 @@ Please file an [issue](https://github.com/opencultureconsulting/openrefine-task- - [ ] example for download "fresh" input data as a dependent task and generating archives/diffs - [ ] example for applying multiple json files - [ ] example for templating xml and validation with xmllint + - [ ] example for multiple projects in one directory/taskfile - [ ] describe example datasets (and differences) with source code examples - [ ] elaborate how-to for developing tasks - [ ] document openrefine-client options and defaults (tables for input and output with file-format-specific defaults) including templating diff --git a/Taskfile.yml b/Taskfile.yml index fe686ca..d209c66 100644 --- a/Taskfile.yml +++ b/Taskfile.yml @@ -1,4 +1,4 @@ -# https://github.com/opencultureconsulting/openrefine-tasks +# https://github.com/opencultureconsulting/openrefine-task-runner version: '3' @@ -6,11 +6,17 @@ includes: example-doaj: example-doaj example-duplicates: example-duplicates example-powerhouse: example-powerhouse - # add your project here + # add the directory name of your project here silent: true output: prefixed +env: + OPENREFINE: + sh: readlink -m openrefine/refine + CLIENT: + sh: readlink -m openrefine/client + tasks: default: desc: execute all projects in parallel @@ -18,7 +24,7 @@ tasks: - task: example-doaj:refine - task: example-duplicates:refine - task: example-powerhouse:refine - # add your project here + # add the directory name of your project here cmds: - task: check @@ -44,16 +50,16 @@ tasks: && chmod +x openrefine/client start: - dir: ./{{.PROJECT}} + dir: ./{{.DIR}} cmds: - | # check install and delete any temporary OpenRefine files - if [ ! -f "../openrefine/refine" ]; then + if [ ! -f "$OPENREFINE" ]; then echo 1>&2 "OpenRefine missing; try task install"; exit 1 fi rm -rf ./*.project* workspace.json openrefine.log - > # launch OpenRefine with specific data directory and redirect its output to a log file - ../openrefine/refine -v warn -p {{.PORT}} -m {{.RAM}} - -d ../{{.PROJECT}} + "$OPENREFINE" -v warn -p {{.PORT}} -m {{.RAM}} + -d ../{{.DIR}} >> openrefine.log 2>&1 & - | # wait until OpenRefine API is available timeout 30s bash -c "until @@ -62,7 +68,7 @@ tasks: done" stop: - dir: ./{{.PROJECT}} + dir: ./{{.DIR}} cmds: - | # shut down OpenRefine PID=$(lsof -t -i:{{.PORT}}) @@ -70,14 +76,14 @@ tasks: while ps -p $PID > /dev/null; do sleep 1; done - > # archive the OpenRefine project and delete temporary files tar cfz - {{.PROJECT}}.openrefine.tar.gz - -C $(grep -l {{.PROJECT}} *.project/metadata.json | cut -d '/' -f 1) + "{{.PROJECT}}.openrefine.tar.gz" + -C $(grep -l "{{.PROJECT}}" *.project/metadata.json | cut -d '/' -f 1) . && rm -rf ./*.project* workspace.json check: desc: check OpenRefine log for any warnings and exit on error - dir: ./{{.PROJECT}} + dir: ./{{.DIR}} cmds: - | # find log file(s) and check for "exception" or "error" if grep -i 'exception\|error' $(find . -name openrefine.log); then diff --git a/example-doaj/Taskfile.yml b/example-doaj/Taskfile.yml index a03ede8..a3f411f 100644 --- a/example-doaj/Taskfile.yml +++ b/example-doaj/Taskfile.yml @@ -3,34 +3,36 @@ version: '3' tasks: main: desc: Library Carpentry Lesson covering DOAJ + vars: {DIR: '{{splitList ":" .TASK | first}}'} cmds: - task: refine - task: :check # check OpenRefine log for any warnings and exit on error - vars: {PROJECT: '{{splitList ":" .TASK | first}}'} + vars: {DIR: '{{.DIR}}'} refine: - dir: ./{{.PROJECT}} + dir: ./{{.DIR}} vars: - PORT: 3335 # assign a different port for each project + PORT: 3334 # assign a different port for each project RAM: 2048M # maximum RAM for OpenRefine java heap space + DIR: '{{splitList ":" .TASK | first}}' PROJECT: '{{splitList ":" .TASK | first}}' deps: # will be executed each run independent of up-to-date check - task: download cmds: - task: :start # launch OpenRefine - vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}', RAM: '{{.RAM}}'} + vars: {DIR: '{{.DIR}}', PORT: '{{.PORT}}', RAM: '{{.RAM}}'} - > # import file - ../openrefine/client -P {{.PORT}} + "$CLIENT" -P {{.PORT}} --create "$(readlink -m input/doaj-article-sample.csv)" --projectName {{.PROJECT}} > >(tee -a openrefine.log) 2>&1 - > # apply transformation rules - ../openrefine/client -P {{.PORT}} {{.PROJECT}} + "$CLIENT" -P {{.PORT}} {{.PROJECT}} --apply config/doaj-openrefine.json > >(tee -a openrefine.log) 2>&1 - > # export to file mkdir -p output && - ../openrefine/client -P {{.PORT}} {{.PROJECT}} + "$CLIENT" -P {{.PORT}} {{.PROJECT}} --output "$(readlink -m output/doaj-results.tsv)" > >(tee -a openrefine.log) 2>&1 - | # print allocated system resources @@ -40,7 +42,7 @@ tasks: echo "used $(ps --no-headers -o cputime -p "$PID") CPU time" \ > >(tee -a openrefine.log) - task: :stop # shut down OpenRefine and archive the OpenRefine project - vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'} + vars: {DIR: '{{.DIR}}', PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'} sources: - input/** - config/** @@ -52,7 +54,8 @@ tasks: # https://github.com/go-task/task/issues/141 download: - dir: '{{splitList ":" .TASK | first}}' + dir: ./{{.DIR}} + vars: {DIR: '{{splitList ":" .TASK | first}}'} cmds: - mkdir -p input config - > # Download input @@ -64,4 +67,4 @@ tasks: default: # enable standalone execution (running `task` in project directory) cmds: - - PROJECT="${PWD##*/}:main" && cd .. && task "$PROJECT" + - DIR="${PWD##*/}:main" && cd .. && task "$DIR" diff --git a/example-duplicates/Taskfile.yml b/example-duplicates/Taskfile.yml index ae838c5..60a31fe 100644 --- a/example-duplicates/Taskfile.yml +++ b/example-duplicates/Taskfile.yml @@ -3,33 +3,35 @@ version: '3' tasks: main: desc: Removing duplicates in a very small test dataset + vars: {DIR: '{{splitList ":" .TASK | first}}'} cmds: - task: refine - task: :check # check OpenRefine log for any warnings and exit on error - vars: {PROJECT: '{{splitList ":" .TASK | first}}'} + vars: {DIR: '{{.DIR}}'} refine: - dir: ./{{.PROJECT}} + dir: ./{{.DIR}} vars: - PORT: 3334 # assign a different port for each project + PORT: 3335 # assign a different port for each project RAM: 2048M # maximum RAM for OpenRefine java heap space + DIR: '{{splitList ":" .TASK | first}}' PROJECT: '{{splitList ":" .TASK | first}}' cmds: - task: :start # launch OpenRefine - vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}', RAM: '{{.RAM}}'} + vars: {DIR: '{{.DIR}}', PORT: '{{.PORT}}', RAM: '{{.RAM}}'} - > # import file - ../openrefine/client -P {{.PORT}} + "$CLIENT" -P {{.PORT}} --create "$(readlink -m input/duplicates.csv)" --encoding UTF-8 --projectName {{.PROJECT}} > >(tee -a openrefine.log) 2>&1 - > # apply transformation rules - ../openrefine/client -P {{.PORT}} {{.PROJECT}} + "$CLIENT" -P {{.PORT}} {{.PROJECT}} --apply config/duplicates-deletion.json > >(tee -a openrefine.log) 2>&1 - > # export to file mkdir -p output && - ../openrefine/client -P {{.PORT}} {{.PROJECT}} + "$CLIENT" -P {{.PORT}} {{.PROJECT}} --output "$(readlink -m output/deduped.xls)" > >(tee -a openrefine.log) 2>&1 - | # print allocated system resources @@ -39,7 +41,7 @@ tasks: echo "used $(ps --no-headers -o cputime -p "$PID") CPU time" \ > >(tee -a openrefine.log) - task: :stop # shut down OpenRefine and archive the OpenRefine project - vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'} + vars: {DIR: '{{.DIR}}', PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'} sources: - input/** - config/** @@ -52,4 +54,4 @@ tasks: default: # enable standalone execution (running `task` in project directory) cmds: - - PROJECT="${PWD##*/}:main" && cd .. && task "$PROJECT" + - DIR="${PWD##*/}:main" && cd .. && task "$DIR" diff --git a/example-powerhouse/Taskfile.yml b/example-powerhouse/Taskfile.yml index ba528a7..643f35d 100644 --- a/example-powerhouse/Taskfile.yml +++ b/example-powerhouse/Taskfile.yml @@ -3,36 +3,38 @@ version: '3' tasks: main: desc: Powerhouse Museum Tutorial + vars: {DIR: '{{splitList ":" .TASK | first}}'} cmds: - task: refine - task: :check # check OpenRefine log for any warnings and exit on error - vars: {PROJECT: '{{splitList ":" .TASK | first}}'} + vars: {DIR: '{{.DIR}}'} refine: - dir: ./{{.PROJECT}} + dir: ./{{.DIR}} vars: PORT: 3336 # assign a different port for each project RAM: 2048M # maximum RAM for OpenRefine java heap space + DIR: '{{splitList ":" .TASK | first}}' PROJECT: '{{splitList ":" .TASK | first}}' deps: # will be executed each run independent of up-to-date check - task: download cmds: - task: :start # launch OpenRefine - vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}', RAM: '{{.RAM}}'} + vars: {DIR: '{{.DIR}}', PORT: '{{.PORT}}', RAM: '{{.RAM}}'} - > # import file - ../openrefine/client -P {{.PORT}} + "$CLIENT" -P {{.PORT}} --create "$(readlink -m input/phm-collection.tsv)" --processQuotes false --guessCellValueTypes true --projectName {{.PROJECT}} > >(tee -a openrefine.log) 2>&1 - > # apply transformation rules - ../openrefine/client -P {{.PORT}} {{.PROJECT}} + "$CLIENT" -P {{.PORT}} {{.PROJECT}} --apply config/phm-transform.json > >(tee -a openrefine.log) 2>&1 - > # export to file mkdir -p output && - ../openrefine/client -P {{.PORT}} {{.PROJECT}} + "$CLIENT" -P {{.PORT}} {{.PROJECT}} --output "$(readlink -m output/phm-results.tsv)" > >(tee -a openrefine.log) 2>&1 - | # print allocated system resources @@ -42,7 +44,7 @@ tasks: echo "used $(ps --no-headers -o cputime -p "$PID") CPU time" \ > >(tee -a openrefine.log) - task: :stop # shut down OpenRefine and archive the OpenRefine project - vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'} + vars: {DIR: '{{.DIR}}', PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'} sources: - input/** - config/** @@ -53,7 +55,8 @@ tasks: ignore_error: true # workaround to avoid an orphaned Java process on error # https://github.com/go-task/task/issues/141 download: - dir: '{{splitList ":" .TASK | first}}' + dir: ./{{.DIR}} + vars: {DIR: '{{splitList ":" .TASK | first}}'} cmds: - mkdir -p input config - > # Download input @@ -65,4 +68,4 @@ tasks: default: # enable standalone execution (running `task` in project directory) cmds: - - PROJECT="${PWD##*/}:main" && cd .. && task "$PROJECT" + - DIR="${PWD##*/}:main" && cd .. && task "$DIR"