From 6789554c6073fbeb219a2c7aa485b88ca454e465 Mon Sep 17 00:00:00 2001 From: Felix Lohmeier Date: Tue, 23 Feb 2021 17:11:59 +0100 Subject: [PATCH] :art: restructure taskfiles --- .gitignore | 2 + README.md | 2 +- Taskfile.yml | 63 +++++++++++++------------ example-doaj/Taskfile.yml | 79 +++++++++++++------------------ example-duplicates/Taskfile.yml | 72 ++++++++++------------------ example-powerhouse/Taskfile.yml | 84 +++++++++++++-------------------- 6 files changed, 127 insertions(+), 175 deletions(-) diff --git a/.gitignore b/.gitignore index d6184dc..d3e2c04 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,8 @@ .task openrefine */output +*/openrefine.log +*/*.openrefine.tar.gz example-doaj/input example-doaj/config example-powerhouse/input diff --git a/README.md b/README.md index 19a65dc..e86fb65 100644 --- a/README.md +++ b/README.md @@ -139,4 +139,4 @@ Please file an [issue](https://github.com/opencultureconsulting/openrefine-task- - [ ] how-to for extracting input options from OpenRefine GUI (via metadata in open project) - [ ] document known issues, e.g. 
[import xls, xlsx, ods](https://github.com/opencultureconsulting/openrefine-client/issues/4) - [ ] add Binder files and badge -- [ ] add example notebooks (links to nbviewer and Binder) \ No newline at end of file +- [ ] add example notebooks (links to nbviewer and Binder) diff --git a/Taskfile.yml b/Taskfile.yml index 1713698..473383c 100644 --- a/Taskfile.yml +++ b/Taskfile.yml @@ -3,15 +3,9 @@ version: '3' includes: - example-doaj: - taskfile: example-doaj - dir: example-doaj - example-duplicates: - taskfile: example-duplicates - dir: example-duplicates - example-powerhouse: - taskfile: example-powerhouse - dir: example-powerhouse + example-doaj: example-doaj + example-duplicates: example-duplicates + example-powerhouse: example-powerhouse # add your project here silent: true @@ -32,29 +26,35 @@ tasks: desc: (re)install OpenRefine and openrefine-client into subdirectory openrefine cmds: - | # delete existing install and recreate folder - rm -rf openrefine; mkdir -p openrefine - - | # install OpenRefine into subdirectory openrefine - wget --no-verbose -O openrefine.tar.gz https://github.com/OpenRefine/OpenRefine/releases/download/3.4.1/openrefine-linux-3.4.1.tar.gz - tar -xzf openrefine.tar.gz -C openrefine --strip 1 && rm openrefine.tar.gz - - sed -i 's/cd `dirname $0`/cd "$(dirname "$0")"/' "openrefine/refine" # fix path issue in OpenRefine startup file - - sed -i '$ a JAVA_OPTIONS=-Drefine.headless=true' "openrefine/refine.ini" # do not try to open OpenRefine in browser - - sed -i 's/#REFINE_AUTOSAVE_PERIOD=60/REFINE_AUTOSAVE_PERIOD=1440/' "openrefine/refine.ini" # set autosave period from 5 minutes to 25 hours - - | # install openrefine-client into subdirectory openrefine - wget --no-verbose -O openrefine/client https://github.com/opencultureconsulting/openrefine-client/releases/download/v0.3.10/openrefine-client_0-3-10_linux - chmod +x openrefine/client + rm -rf openrefine + mkdir -p openrefine + - > # download OpenRefine archive + wget --no-verbose -O 
openrefine.tar.gz + https://github.com/OpenRefine/OpenRefine/releases/download/3.4.1/openrefine-linux-3.4.1.tar.gz + - > # install OpenRefine into subdirectory openrefine + tar -xzf openrefine.tar.gz -C openrefine --strip 1 + && rm openrefine.tar.gz + - | # optimize OpenRefine for batch processing + sed -i 's/cd `dirname $0`/cd "$(dirname "$0")"/' "openrefine/refine" # fix path issue in OpenRefine startup file + sed -i '$ a JAVA_OPTIONS=-Drefine.headless=true' "openrefine/refine.ini" # do not try to open OpenRefine in browser + sed -i 's/#REFINE_AUTOSAVE_PERIOD=60/REFINE_AUTOSAVE_PERIOD=1440/' "openrefine/refine.ini" # set autosave period from 5 minutes to 24 hours + - > # download openrefine-client into subdirectory openrefine + wget --no-verbose -O openrefine/client + https://github.com/opencultureconsulting/openrefine-client/releases/download/v0.3.10/openrefine-client_0-3-10_linux + && chmod +x openrefine/client start: - dir: ./{{.PROJECT}}/output + dir: ./{{.PROJECT}} cmds: - | # check install and delete any temporary OpenRefine files - if [ ! -f "../../openrefine/refine" ]; then + if [ ! 
-f "../openrefine/refine" ]; then echo 1>&2 "OpenRefine missing; try task install"; exit 1 fi rm -rf ./*.project* workspace.json - - | # launch OpenRefine with specific data directory and redirect its output to a log file - ../../openrefine/refine -v warn -p {{.PORT}} -m {{.RAM}} \ - -d ../{{.PROJECT}}/output \ - > openrefine.log 2>&1 & + - > # launch OpenRefine with specific data directory and redirect its output to a log file + ../openrefine/refine -v warn -p {{.PORT}} -m {{.RAM}} + -d ../{{.PROJECT}} + > openrefine.log 2>&1 & - | # wait until OpenRefine API is available timeout 30s bash -c "until wget -q -O - http://localhost:{{.PORT}} | cat | grep -q -o OpenRefine @@ -62,17 +62,18 @@ tasks: done" stop: - dir: ./{{.PROJECT}}/output + dir: ./{{.PROJECT}} cmds: - | # shut down OpenRefine PID=$(lsof -t -i:{{.PORT}}) kill $PID while ps -p $PID > /dev/null; do sleep 1; done - - | # archive the OpenRefine project - tar cfz \ - {{.PROJECT}}.openrefine.tar.gz \ - -C $(grep -l {{.PROJECT}} *.project/metadata.json | cut -d '/' -f 1) \ - . + - > # archive the OpenRefine project and delete temporary files + tar cfz + {{.PROJECT}}.openrefine.tar.gz + -C $(grep -l {{.PROJECT}} *.project/metadata.json | cut -d '/' -f 1) + . 
+ && rm -rf ./*.project* workspace.json check: desc: check OpenRefine log for any warnings and exit on error diff --git a/example-doaj/Taskfile.yml b/example-doaj/Taskfile.yml index 679642c..c47dae6 100644 --- a/example-doaj/Taskfile.yml +++ b/example-doaj/Taskfile.yml @@ -6,71 +6,56 @@ tasks: cmds: - task: refine - task: :check # check OpenRefine log for any warnings and exit on error - vars: {PROJECT: '{{splitList ":" .TASK | first}}'} + vars: {PROJECT: '{{splitList ":" .TASK | first}}'} refine: + dir: ./{{.PROJECT}} vars: PORT: 3335 # assign a different port for each project RAM: 2048M # maximum RAM for OpenRefine java heap space PROJECT: '{{splitList ":" .TASK | first}}' deps: # will be executed each run independent of up-to-date check - task: download - cmds: # tasks prepended with ":" are defined in Taskfile.yml - - task: :start + cmds: + - task: :start # launch OpenRefine vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}', RAM: '{{.RAM}}'} - - task: import - vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'} - - task: apply - vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'} - - task: export - vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'} - - task: stats - vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'} - - task: :stop + - > # import file + ../openrefine/client -P {{.PORT}} + --create "$(readlink -m input/doaj-article-sample.csv)" + --projectName {{.PROJECT}} + - > # apply transformation rules + ../openrefine/client -P {{.PORT}} {{.PROJECT}} + --apply config/doaj-openrefine.json + - > # export to file + mkdir -p output && + ../openrefine/client -P {{.PORT}} {{.PROJECT}} + --output "$(readlink -m output/doaj-results.tsv)" + - | # print allocated system resources + PID="$(lsof -t -i:{{.PORT}})" + echo "used $(($(ps --no-headers -o rss -p "$PID") / 1024)) MB RAM" + echo "used $(ps --no-headers -o cputime -p "$PID") CPU time" + - task: :stop # shut down OpenRefine and archive the OpenRefine project vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'} 
sources: - input/** - config/** generates: - - output/openrefine.log - - output/{{.PROJECT}}.openrefine.tar.gz - ignore_error: true # workaround to avoid an orphaned Java process on error https://github.com/go-task/task/issues/141 + - openrefine.log + - ./{{.PROJECT}}.openrefine.tar.gz + - output/** + ignore_error: true # workaround to avoid an orphaned Java process on error + # https://github.com/go-task/task/issues/141 download: + dir: '{{splitList ":" .TASK | first}}' cmds: - mkdir -p input config - - wget --no-verbose -O input/doaj-article-sample.csv https://github.com/felixlohmeier/openrefine-kimws2019/raw/master/doaj-article-sample.csv - - wget --no-verbose -O config/doaj-openrefine.json https://github.com/felixlohmeier/openrefine-kimws2019/raw/master/doaj-openrefine.json - - import: - dir: input - cmds: - - | # import file - ../../openrefine/client -P {{.PORT}} \ - --create doaj-article-sample.csv \ - --projectName {{.PROJECT}} - ignore_error: true # workaround - - apply: - dir: config - cmds: - - | # apply transformation rules - ../../openrefine/client -P {{.PORT}} {{.PROJECT}} \ - --apply doaj-openrefine.json - ignore_error: true # workaround - - export: - dir: output - cmds: - - | # export to file; use readlink to log full path to output file - ../../openrefine/client -P {{.PORT}} {{.PROJECT}} \ - --output "$(readlink -m doaj-results.tsv)" - ignore_error: true # workaround - - stats: - cmds: - - ps -o start,etime,%mem,%cpu,rss -p $(lsof -t -i:{{.PORT}}) # print allocated system resources - ignore_error: true # workaround + - > # Download input + wget --no-verbose -O input/doaj-article-sample.csv + https://github.com/felixlohmeier/openrefine-kimws2019/raw/master/doaj-article-sample.csv + - > # Download config + wget --no-verbose -O config/doaj-openrefine.json + https://github.com/felixlohmeier/openrefine-kimws2019/raw/master/doaj-openrefine.json default: # enable standalone execution (running `task` in project directory) cmds: diff --git 
a/example-duplicates/Taskfile.yml b/example-duplicates/Taskfile.yml index fd6abb0..9016234 100644 --- a/example-duplicates/Taskfile.yml +++ b/example-duplicates/Taskfile.yml @@ -6,64 +6,44 @@ tasks: cmds: - task: refine - task: :check # check OpenRefine log for any warnings and exit on error - vars: {PROJECT: '{{splitList ":" .TASK | first}}'} + vars: {PROJECT: '{{splitList ":" .TASK | first}}'} refine: + dir: ./{{.PROJECT}} vars: PORT: 3334 # assign a different port for each project RAM: 2048M # maximum RAM for OpenRefine java heap space PROJECT: '{{splitList ":" .TASK | first}}' - cmds: # tasks prepended with ":" are defined in Taskfile.yml - - task: :start + cmds: + - task: :start # launch OpenRefine vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}', RAM: '{{.RAM}}'} - - task: import - vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'} - - task: apply - vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'} - - task: export - vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'} - - task: stats - vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'} - - task: :stop + - > # import file + ../openrefine/client -P {{.PORT}} + --create "$(readlink -m input/duplicates.csv)" + --encoding UTF-8 + --projectName {{.PROJECT}} + - > # apply transformation rules + ../openrefine/client -P {{.PORT}} {{.PROJECT}} + --apply config/duplicates-deletion.json + - > # export to file + mkdir -p output && + ../openrefine/client -P {{.PORT}} {{.PROJECT}} + --output "$(readlink -m output/deduped.xls)" + - | # print allocated system resources + PID="$(lsof -t -i:{{.PORT}})" + echo "used $(($(ps --no-headers -o rss -p "$PID") / 1024)) MB RAM" + echo "used $(ps --no-headers -o cputime -p "$PID") CPU time" + - task: :stop # shut down OpenRefine and archive the OpenRefine project vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'} sources: - input/** - config/** generates: - - output/openrefine.log - - output/{{.PROJECT}}.openrefine.tar.gz - ignore_error: true # workaround to avoid an orphaned Java 
process on error https://github.com/go-task/task/issues/141 - - import: - dir: input - cmds: - - | # import file - ../../openrefine/client -P {{.PORT}} \ - --create duplicates.csv \ - --encoding UTF-8 \ - --projectName {{.PROJECT}} - ignore_error: true # workaround - - apply: - dir: config - cmds: - - | # apply transformation rules - ../../openrefine/client -P {{.PORT}} {{.PROJECT}} \ - --apply duplicates-deletion.json - ignore_error: true # workaround - - export: - dir: output - cmds: - - | # export to file; use readlink to log full path to output file - ../../openrefine/client -P {{.PORT}} {{.PROJECT}} \ - --output "$(readlink -m deduped.xls)" - ignore_error: true # workaround - - stats: - cmds: - - ps -o start,etime,%mem,%cpu,rss -p $(lsof -t -i:{{.PORT}}) # print allocated system resources - ignore_error: true # workaround + - openrefine.log + - ./{{.PROJECT}}.openrefine.tar.gz + - output/** + ignore_error: true # workaround to avoid an orphaned Java process on error + # https://github.com/go-task/task/issues/141 default: # enable standalone execution (running `task` in project directory) cmds: diff --git a/example-powerhouse/Taskfile.yml b/example-powerhouse/Taskfile.yml index 2b9191a..ffcb044 100644 --- a/example-powerhouse/Taskfile.yml +++ b/example-powerhouse/Taskfile.yml @@ -6,73 +6,57 @@ tasks: cmds: - task: refine - task: :check # check OpenRefine log for any warnings and exit on error - vars: {PROJECT: '{{splitList ":" .TASK | first}}'} + vars: {PROJECT: '{{splitList ":" .TASK | first}}'} refine: + dir: ./{{.PROJECT}} vars: PORT: 3336 # assign a different port for each project RAM: 2048M # maximum RAM for OpenRefine java heap space PROJECT: '{{splitList ":" .TASK | first}}' deps: # will be executed each run independent of up-to-date check - task: download - cmds: # tasks prepended with ":" are defined in Taskfile.yml - - task: :start + cmds: + - task: :start # launch OpenRefine vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}', RAM: '{{.RAM}}'} - - 
task: import - vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'} - - task: apply - vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'} - - task: export - vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'} - - task: stats - vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'} - - task: :stop + - > # import file + ../openrefine/client -P {{.PORT}} + --create "$(readlink -m input/phm-collection.tsv)" + --processQuotes false + --guessCellValueTypes true + --projectName {{.PROJECT}} + - > # apply transformation rules + ../openrefine/client -P {{.PORT}} {{.PROJECT}} + --apply config/phm-transform.json + - > # export to file + mkdir -p output && + ../openrefine/client -P {{.PORT}} {{.PROJECT}} + --output "$(readlink -m output/phm-results.tsv)" + - | # print allocated system resources + PID="$(lsof -t -i:{{.PORT}})" + echo "used $(($(ps --no-headers -o rss -p "$PID") / 1024)) MB RAM" + echo "used $(ps --no-headers -o cputime -p "$PID") CPU time" + - task: :stop # shut down OpenRefine and archive the OpenRefine project vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'} sources: - input/** - config/** generates: - - output/openrefine.log - - output/{{.PROJECT}}.openrefine.tar.gz - ignore_error: true # workaround to avoid an orphaned Java process on error https://github.com/go-task/task/issues/141 - + - openrefine.log + - ./{{.PROJECT}}.openrefine.tar.gz + - output/** + ignore_error: true # workaround to avoid an orphaned Java process on error + # https://github.com/go-task/task/issues/141 download: + dir: '{{splitList ":" .TASK | first}}' cmds: - mkdir -p input config - - wget --no-verbose -O input/phm-collection.tsv https://github.com/opencultureconsulting/openrefine-batch/raw/master/examples/powerhouse-museum/input/phm-collection.tsv - - wget --no-verbose -O config/phm-transform.json https://github.com/opencultureconsulting/openrefine-batch/raw/master/examples/powerhouse-museum/config/phm-transform.json - - import: - dir: input - cmds: - - | # import file - 
../../openrefine/client -P {{.PORT}} \ - --create phm-collection.tsv \ - --processQuotes false \ - --guessCellValueTypes true \ - --projectName {{.PROJECT}} - ignore_error: true # workaround - - apply: - dir: config - cmds: - - | # apply transformation rules - ../../openrefine/client -P {{.PORT}} {{.PROJECT}} \ - --apply phm-transform.json - ignore_error: true # workaround - - export: - dir: output - cmds: - - | # export to file; use readlink to log full path to output file - ../../openrefine/client -P {{.PORT}} {{.PROJECT}} \ - --output "$(readlink -m phm-results.tsv)" - ignore_error: true # workaround - - stats: - cmds: - - ps -o start,etime,%mem,%cpu,rss -p $(lsof -t -i:{{.PORT}}) # print allocated system resources - ignore_error: true # workaround + - > # Download input + wget --no-verbose -O input/phm-collection.tsv + https://github.com/opencultureconsulting/openrefine-batch/raw/master/examples/powerhouse-museum/input/phm-collection.tsv + - > # Download config + wget --no-verbose -O config/phm-transform.json + https://github.com/opencultureconsulting/openrefine-batch/raw/master/examples/powerhouse-museum/config/phm-transform.json default: # enable standalone execution (running `task` in project directory) cmds: