🎨 restructure taskfiles

2025-05-18 00:00:43 +02:00 · 2021-02-23 17:11:59 +01:00 · 2021-02-23 17:11:59 +01:00 · 6789554c60
commit 6789554c60
parent 62eb0cddbf
6 changed files with 127 additions and 175 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,6 +1,8 @@
 .task
 openrefine
 */output
+*/openrefine.log
+*/*.openrefine.tar.gz
 example-doaj/input
 example-doaj/config
 example-powerhouse/input
--- a/Taskfile.yml
+++ b/Taskfile.yml
@ -3,15 +3,9 @@
 version: '3'

 includes:
-  example-doaj:
-    taskfile: example-doaj
-    dir: example-doaj
-  example-duplicates:
-    taskfile: example-duplicates
-    dir: example-duplicates
-  example-powerhouse:
-    taskfile: example-powerhouse
-    dir: example-powerhouse
+  example-doaj: example-doaj
+  example-duplicates: example-duplicates
+  example-powerhouse: example-powerhouse
  # add your project here

 silent: true
@ -32,29 +26,35 @@ tasks:
    desc: (re)install OpenRefine and openrefine-client into subdirectory openrefine
    cmds:
      - | # delete existing install and recreate folder
-        rm -rf openrefine; mkdir -p openrefine
-      - | # install OpenRefine into subdirectory openrefine
-        wget --no-verbose -O openrefine.tar.gz https://github.com/OpenRefine/OpenRefine/releases/download/3.4.1/openrefine-linux-3.4.1.tar.gz
-        tar -xzf openrefine.tar.gz -C openrefine --strip 1 && rm openrefine.tar.gz
-      - sed -i 's/cd `dirname $0`/cd "$(dirname "$0")"/' "openrefine/refine" # fix path issue in OpenRefine startup file
-      - sed -i '$ a JAVA_OPTIONS=-Drefine.headless=true' "openrefine/refine.ini" # do not try to open OpenRefine in browser
-      - sed -i 's/#REFINE_AUTOSAVE_PERIOD=60/REFINE_AUTOSAVE_PERIOD=1440/' "openrefine/refine.ini" # set autosave period from 5 minutes to 25 hours
-      - | # install openrefine-client into subdirectory openrefine
-        wget --no-verbose -O openrefine/client https://github.com/opencultureconsulting/openrefine-client/releases/download/v0.3.10/openrefine-client_0-3-10_linux
-        chmod +x openrefine/client
+        rm -rf openrefine
+        mkdir -p openrefine
+      - > # download OpenRefine archive
+        wget --no-verbose -O openrefine.tar.gz
+        https://github.com/OpenRefine/OpenRefine/releases/download/3.4.1/openrefine-linux-3.4.1.tar.gz
+      - > # install OpenRefine into subdirectory openrefine
+        tar -xzf openrefine.tar.gz -C openrefine --strip 1
+        && rm openrefine.tar.gz
+      - | # optimize OpenRefine for batch processing
+        sed -i 's/cd `dirname $0`/cd "$(dirname "$0")"/' "openrefine/refine" # fix path issue in OpenRefine startup file
+        sed -i '$ a JAVA_OPTIONS=-Drefine.headless=true' "openrefine/refine.ini" # do not try to open OpenRefine in browser
+        sed -i 's/#REFINE_AUTOSAVE_PERIOD=60/REFINE_AUTOSAVE_PERIOD=1440/' "openrefine/refine.ini" # set autosave period from 5 minutes to 24 hours (1440 minutes)
+      - > # download openrefine-client into subdirectory openrefine
+        wget --no-verbose -O openrefine/client
+        https://github.com/opencultureconsulting/openrefine-client/releases/download/v0.3.10/openrefine-client_0-3-10_linux
+        && chmod +x openrefine/client

  start:
-    dir: ./{{.PROJECT}}/output
+    dir: ./{{.PROJECT}}
    cmds:
      - | # check install and delete any temporary OpenRefine files
-        if [ ! -f "../../openrefine/refine" ]; then
+        if [ ! -f "../openrefine/refine" ]; then
          echo 1>&2 "OpenRefine missing; try task install"; exit 1
        fi
        rm -rf ./*.project* workspace.json
-      - | # launch OpenRefine with specific data directory and redirect its output to a log file
-        ../../openrefine/refine -v warn -p {{.PORT}} -m {{.RAM}} \
-          -d ../{{.PROJECT}}/output \
-          > openrefine.log 2>&1 & 
+      - > # launch OpenRefine with specific data directory and redirect its output to a log file
+        ../openrefine/refine -v warn -p {{.PORT}} -m {{.RAM}}
+        -d ../{{.PROJECT}}
+        > openrefine.log 2>&1 &
      - | # wait until OpenRefine API is available
        timeout 30s bash -c "until
          wget -q -O - http://localhost:{{.PORT}} | cat | grep -q -o OpenRefine
@ -62,17 +62,18 @@ tasks:
        done"

  stop:
-    dir: ./{{.PROJECT}}/output
+    dir: ./{{.PROJECT}}
    cmds:
      - | # shut down OpenRefine
        PID=$(lsof -t -i:{{.PORT}})
        kill $PID
        while ps -p $PID > /dev/null; do sleep 1; done
-      - | # archive the OpenRefine project
-        tar cfz \
-          {{.PROJECT}}.openrefine.tar.gz \
-          -C $(grep -l {{.PROJECT}} *.project/metadata.json | cut -d '/' -f 1) \
-          .
+      - > # archive the contents of the matching *.project folder; on success delete temporary files
+        tar cfz
+        {{.PROJECT}}.openrefine.tar.gz
+        -C $(grep -l {{.PROJECT}} *.project/metadata.json | cut -d '/' -f 1)
+        .
+        && rm -rf ./*.project* workspace.json

  check:
    desc: check OpenRefine log for any warnings and exit on error
--- a/example-doaj/Taskfile.yml
+++ b/example-doaj/Taskfile.yml
@ -9,68 +9,53 @@ tasks:
        vars: {PROJECT: '{{splitList ":" .TASK | first}}'}

  refine:
+    dir: ./{{.PROJECT}}
    vars:
      PORT: 3335 # assign a different port for each project
      RAM: 2048M # maximum RAM for OpenRefine java heap space
      PROJECT: '{{splitList ":" .TASK | first}}'
    deps: # will be executed each run independent of up-to-date check
      - task: download
-    cmds: # tasks prepended with ":" are defined in Taskfile.yml
-      - task: :start
+    cmds:
+      - task: :start # launch OpenRefine (tasks prefixed with ":" are defined in the root Taskfile.yml)
        vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}', RAM: '{{.RAM}}'}
-      - task: import
-        vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
-      - task: apply
-        vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
-      - task: export
-        vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
-      - task: stats
-        vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
-      - task: :stop
+      - > # create the OpenRefine project from the input file (readlink -m yields an absolute path)
+        ../openrefine/client -P {{.PORT}}
+        --create "$(readlink -m input/doaj-article-sample.csv)"
+        --projectName {{.PROJECT}}
+      - > # apply transformation rules from the JSON file in config/
+        ../openrefine/client -P {{.PORT}} {{.PROJECT}}
+        --apply config/doaj-openrefine.json
+      - > # create the output folder, then export the project to a file (absolute path)
+        mkdir -p output &&
+        ../openrefine/client -P {{.PORT}} {{.PROJECT}}
+        --output "$(readlink -m output/doaj-results.tsv)"
+      - | # print RAM (RSS in MB) and CPU time of the process listening on PORT
+        PID="$(lsof -t -i:{{.PORT}})"
+        echo "used $(($(ps --no-headers -o rss -p "$PID") / 1024)) MB RAM"
+        echo "used $(ps --no-headers -o cputime -p "$PID") CPU time"
+      - task: :stop # shut down OpenRefine and archive the OpenRefine project
        vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
    sources:
      - input/**
      - config/**
    generates:
-      - output/openrefine.log
-      - output/{{.PROJECT}}.openrefine.tar.gz
-    ignore_error: true # workaround to avoid an orphaned Java process on error https://github.com/go-task/task/issues/141
+      - openrefine.log
+      - ./{{.PROJECT}}.openrefine.tar.gz
+      - output/**
+    ignore_error: true # workaround to avoid an orphaned Java process on error
+                       # https://github.com/go-task/task/issues/141

  download:
+    dir: '{{splitList ":" .TASK | first}}'
    cmds:
      - mkdir -p input config
-      - wget --no-verbose -O input/doaj-article-sample.csv https://github.com/felixlohmeier/openrefine-kimws2019/raw/master/doaj-article-sample.csv
-      - wget --no-verbose -O config/doaj-openrefine.json https://github.com/felixlohmeier/openrefine-kimws2019/raw/master/doaj-openrefine.json
-
-  import:
-    dir: input
-    cmds:
-      - | # import file
-        ../../openrefine/client -P {{.PORT}} \
-        --create doaj-article-sample.csv \
-        --projectName {{.PROJECT}}
-    ignore_error: true # workaround
-
-  apply:
-    dir: config
-    cmds:
-      - | # apply transformation rules
-        ../../openrefine/client -P {{.PORT}} {{.PROJECT}} \
-        --apply doaj-openrefine.json
-    ignore_error: true # workaround
-
-  export:
-    dir: output
-    cmds:
-      - | # export to file; use readlink to log full path to output file
-        ../../openrefine/client -P {{.PORT}} {{.PROJECT}} \
-        --output "$(readlink -m doaj-results.tsv)"
-    ignore_error: true # workaround
-
-  stats:
-    cmds:
-      - ps -o start,etime,%mem,%cpu,rss -p $(lsof -t -i:{{.PORT}}) # print allocated system resources
-    ignore_error: true # workaround
+      - > # download the input file into the input folder
+        wget --no-verbose -O input/doaj-article-sample.csv
+        https://github.com/felixlohmeier/openrefine-kimws2019/raw/master/doaj-article-sample.csv
+      - > # download the transformation rules into the config folder
+        wget --no-verbose -O config/doaj-openrefine.json
+        https://github.com/felixlohmeier/openrefine-kimws2019/raw/master/doaj-openrefine.json

  default: # enable standalone execution (running `task` in project directory)
    cmds:
--- a/example-duplicates/Taskfile.yml
+++ b/example-duplicates/Taskfile.yml
@ -9,61 +9,41 @@ tasks:
        vars: {PROJECT: '{{splitList ":" .TASK | first}}'}

  refine:
+    dir: ./{{.PROJECT}}
    vars:
      PORT: 3334 # assign a different port for each project
      RAM: 2048M # maximum RAM for OpenRefine java heap space
      PROJECT: '{{splitList ":" .TASK | first}}'
-    cmds: # tasks prepended with ":" are defined in Taskfile.yml
-      - task: :start
+    cmds:
+      - task: :start # launch OpenRefine (tasks prefixed with ":" are defined in the root Taskfile.yml)
        vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}', RAM: '{{.RAM}}'}
-      - task: import
-        vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
-      - task: apply
-        vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
-      - task: export
-        vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
-      - task: stats
-        vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
-      - task: :stop
+      - > # create the OpenRefine project from the input file (readlink -m yields an absolute path)
+        ../openrefine/client -P {{.PORT}}
+        --create "$(readlink -m input/duplicates.csv)"
+        --encoding UTF-8
+        --projectName {{.PROJECT}}
+      - > # apply transformation rules from the JSON file in config/
+        ../openrefine/client -P {{.PORT}} {{.PROJECT}}
+        --apply config/duplicates-deletion.json
+      - > # create the output folder, then export the project to a file (absolute path)
+        mkdir -p output &&
+        ../openrefine/client -P {{.PORT}} {{.PROJECT}}
+        --output "$(readlink -m output/deduped.xls)"
+      - | # print RAM (RSS in MB) and CPU time of the process listening on PORT
+        PID="$(lsof -t -i:{{.PORT}})"
+        echo "used $(($(ps --no-headers -o rss -p "$PID") / 1024)) MB RAM"
+        echo "used $(ps --no-headers -o cputime -p "$PID") CPU time"
+      - task: :stop # shut down OpenRefine and archive the OpenRefine project
        vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
    sources:
      - input/**
      - config/**
    generates:
-      - output/openrefine.log
-      - output/{{.PROJECT}}.openrefine.tar.gz
-    ignore_error: true # workaround to avoid an orphaned Java process on error https://github.com/go-task/task/issues/141
-
-  import:
-    dir: input
-    cmds:
-      - | # import file
-        ../../openrefine/client -P {{.PORT}} \
-        --create duplicates.csv \
-        --encoding UTF-8 \
-        --projectName {{.PROJECT}}
-    ignore_error: true # workaround
-
-  apply:
-    dir: config
-    cmds:
-      - | # apply transformation rules
-        ../../openrefine/client -P {{.PORT}} {{.PROJECT}} \
-        --apply duplicates-deletion.json
-    ignore_error: true # workaround
-
-  export:
-    dir: output
-    cmds:
-      - | # export to file; use readlink to log full path to output file
-        ../../openrefine/client -P {{.PORT}} {{.PROJECT}} \
-        --output "$(readlink -m deduped.xls)"
-    ignore_error: true # workaround
-
-  stats:
-    cmds:
-      - ps -o start,etime,%mem,%cpu,rss -p $(lsof -t -i:{{.PORT}}) # print allocated system resources
-    ignore_error: true # workaround
+      - openrefine.log
+      - ./{{.PROJECT}}.openrefine.tar.gz
+      - output/**
+    ignore_error: true # workaround to avoid an orphaned Java process on error
+                       # https://github.com/go-task/task/issues/141

  default: # enable standalone execution (running `task` in project directory)
    cmds:
--- a/example-powerhouse/Taskfile.yml
+++ b/example-powerhouse/Taskfile.yml
@ -9,70 +9,54 @@ tasks:
        vars: {PROJECT: '{{splitList ":" .TASK | first}}'}

  refine:
+    dir: ./{{.PROJECT}}
    vars:
      PORT: 3336 # assign a different port for each project
      RAM: 2048M # maximum RAM for OpenRefine java heap space
      PROJECT: '{{splitList ":" .TASK | first}}'
    deps: # will be executed each run independent of up-to-date check
      - task: download
-    cmds: # tasks prepended with ":" are defined in Taskfile.yml
-      - task: :start
+    cmds:
+      - task: :start # launch OpenRefine (tasks prefixed with ":" are defined in the root Taskfile.yml)
        vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}', RAM: '{{.RAM}}'}
-      - task: import
-        vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
-      - task: apply
-        vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
-      - task: export
-        vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
-      - task: stats
-        vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
-      - task: :stop
+      - > # create the OpenRefine project from the input file (readlink -m yields an absolute path)
+        ../openrefine/client -P {{.PORT}}
+        --create "$(readlink -m input/phm-collection.tsv)"
+        --processQuotes false
+        --guessCellValueTypes true
+        --projectName {{.PROJECT}}
+      - > # apply transformation rules from the JSON file in config/
+        ../openrefine/client -P {{.PORT}} {{.PROJECT}}
+        --apply config/phm-transform.json
+      - > # create the output folder, then export the project to a file (absolute path)
+        mkdir -p output &&
+        ../openrefine/client -P {{.PORT}} {{.PROJECT}}
+        --output "$(readlink -m output/phm-results.tsv)"
+      - | # print RAM (RSS in MB) and CPU time of the process listening on PORT
+        PID="$(lsof -t -i:{{.PORT}})"
+        echo "used $(($(ps --no-headers -o rss -p "$PID") / 1024)) MB RAM"
+        echo "used $(ps --no-headers -o cputime -p "$PID") CPU time"
+      - task: :stop # shut down OpenRefine and archive the OpenRefine project
        vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
    sources:
      - input/**
      - config/**
    generates:
-      - output/openrefine.log
-      - output/{{.PROJECT}}.openrefine.tar.gz
-    ignore_error: true # workaround to avoid an orphaned Java process on error https://github.com/go-task/task/issues/141
-
+      - openrefine.log
+      - ./{{.PROJECT}}.openrefine.tar.gz
+      - output/**
+    ignore_error: true # workaround to avoid an orphaned Java process on error
+                       # https://github.com/go-task/task/issues/141
  download:
+    dir: '{{splitList ":" .TASK | first}}'
    cmds:
      - mkdir -p input config
-      - wget --no-verbose -O input/phm-collection.tsv https://github.com/opencultureconsulting/openrefine-batch/raw/master/examples/powerhouse-museum/input/phm-collection.tsv
-      - wget --no-verbose -O config/phm-transform.json https://github.com/opencultureconsulting/openrefine-batch/raw/master/examples/powerhouse-museum/config/phm-transform.json
-
-  import:
-    dir: input
-    cmds:
-      - | # import file
-        ../../openrefine/client -P {{.PORT}} \
-        --create phm-collection.tsv \
-        --processQuotes false \
-        --guessCellValueTypes true \
-        --projectName {{.PROJECT}}
-    ignore_error: true # workaround
-
-  apply:
-    dir: config
-    cmds:
-      - | # apply transformation rules
-        ../../openrefine/client -P {{.PORT}} {{.PROJECT}} \
-        --apply phm-transform.json
-    ignore_error: true # workaround
-
-  export:
-    dir: output
-    cmds:
-      - | # export to file; use readlink to log full path to output file
-        ../../openrefine/client -P {{.PORT}} {{.PROJECT}} \
-        --output "$(readlink -m phm-results.tsv)"
-    ignore_error: true # workaround
-
-  stats:
-    cmds:
-      - ps -o start,etime,%mem,%cpu,rss -p $(lsof -t -i:{{.PORT}}) # print allocated system resources
-    ignore_error: true # workaround
+      - > # download the input file into the input folder
+        wget --no-verbose -O input/phm-collection.tsv
+        https://github.com/opencultureconsulting/openrefine-batch/raw/master/examples/powerhouse-museum/input/phm-collection.tsv
+      - > # download the transformation rules into the config folder
+        wget --no-verbose -O config/phm-transform.json
+        https://github.com/opencultureconsulting/openrefine-batch/raw/master/examples/powerhouse-museum/config/phm-transform.json

  default: # enable standalone execution (running `task` in project directory)
    cmds: