🎨 restructure taskfiles

This commit is contained in:
Felix Lohmeier 2021-02-23 17:11:59 +01:00
parent 62eb0cddbf
commit 6789554c60
6 changed files with 127 additions and 175 deletions

2
.gitignore vendored
View File

@@ -1,6 +1,8 @@
.task .task
openrefine openrefine
*/output */output
*/openrefine.log
*/*.openrefine.tar.gz
example-doaj/input example-doaj/input
example-doaj/config example-doaj/config
example-powerhouse/input example-powerhouse/input

View File

@@ -139,4 +139,4 @@ Please file an [issue](https://github.com/opencultureconsulting/openrefine-task-
- [ ] how-to for extracting input options from OpenRefine GUI (via metadata in open project) - [ ] how-to for extracting input options from OpenRefine GUI (via metadata in open project)
- [ ] document known issues, e.g. [import xls, xlsx, ods](https://github.com/opencultureconsulting/openrefine-client/issues/4) - [ ] document known issues, e.g. [import xls, xlsx, ods](https://github.com/opencultureconsulting/openrefine-client/issues/4)
- [ ] add Binder files and badge - [ ] add Binder files and badge
- [ ] add example notebooks (links to nbviewer and Binder) - [ ] add example notebooks (links to nbviewer and Binder)

View File

@@ -3,15 +3,9 @@
version: '3' version: '3'
includes: includes:
example-doaj: example-doaj: example-doaj
taskfile: example-doaj example-duplicates: example-duplicates
dir: example-doaj example-powerhouse: example-powerhouse
example-duplicates:
taskfile: example-duplicates
dir: example-duplicates
example-powerhouse:
taskfile: example-powerhouse
dir: example-powerhouse
# add your project here # add your project here
silent: true silent: true
@@ -32,29 +26,35 @@ tasks:
desc: (re)install OpenRefine and openrefine-client into subdirectory openrefine desc: (re)install OpenRefine and openrefine-client into subdirectory openrefine
cmds: cmds:
- | # delete existing install and recreate folder - | # delete existing install and recreate folder
rm -rf openrefine; mkdir -p openrefine rm -rf openrefine
- | # install OpenRefine into subdirectory openrefine mkdir -p openrefine
wget --no-verbose -O openrefine.tar.gz https://github.com/OpenRefine/OpenRefine/releases/download/3.4.1/openrefine-linux-3.4.1.tar.gz - > # download OpenRefine archive
tar -xzf openrefine.tar.gz -C openrefine --strip 1 && rm openrefine.tar.gz wget --no-verbose -O openrefine.tar.gz
- sed -i 's/cd `dirname $0`/cd "$(dirname "$0")"/' "openrefine/refine" # fix path issue in OpenRefine startup file https://github.com/OpenRefine/OpenRefine/releases/download/3.4.1/openrefine-linux-3.4.1.tar.gz
- sed -i '$ a JAVA_OPTIONS=-Drefine.headless=true' "openrefine/refine.ini" # do not try to open OpenRefine in browser - > # install OpenRefine into subdirectory openrefine
- sed -i 's/#REFINE_AUTOSAVE_PERIOD=60/REFINE_AUTOSAVE_PERIOD=1440/' "openrefine/refine.ini" # set autosave period from 5 minutes to 24 hours tar -xzf openrefine.tar.gz -C openrefine --strip 1
- | # install openrefine-client into subdirectory openrefine && rm openrefine.tar.gz
wget --no-verbose -O openrefine/client https://github.com/opencultureconsulting/openrefine-client/releases/download/v0.3.10/openrefine-client_0-3-10_linux - | # optimize OpenRefine for batch processing
chmod +x openrefine/client sed -i 's/cd `dirname $0`/cd "$(dirname "$0")"/' "openrefine/refine" # fix path issue in OpenRefine startup file
sed -i '$ a JAVA_OPTIONS=-Drefine.headless=true' "openrefine/refine.ini" # do not try to open OpenRefine in browser
sed -i 's/#REFINE_AUTOSAVE_PERIOD=60/REFINE_AUTOSAVE_PERIOD=1440/' "openrefine/refine.ini" # set autosave period from 5 minutes to 24 hours
- > # download openrefine-client into subdirectory openrefine
wget --no-verbose -O openrefine/client
https://github.com/opencultureconsulting/openrefine-client/releases/download/v0.3.10/openrefine-client_0-3-10_linux
&& chmod +x openrefine/client
start: start:
dir: ./{{.PROJECT}}/output dir: ./{{.PROJECT}}
cmds: cmds:
- | # check install and delete any temporary OpenRefine files - | # check install and delete any temporary OpenRefine files
if [ ! -f "../../openrefine/refine" ]; then if [ ! -f "../openrefine/refine" ]; then
echo 1>&2 "OpenRefine missing; try task install"; exit 1 echo 1>&2 "OpenRefine missing; try task install"; exit 1
fi fi
rm -rf ./*.project* workspace.json rm -rf ./*.project* workspace.json
- | # launch OpenRefine with specific data directory and redirect its output to a log file - > # launch OpenRefine with specific data directory and redirect its output to a log file
../../openrefine/refine -v warn -p {{.PORT}} -m {{.RAM}} \ ../openrefine/refine -v warn -p {{.PORT}} -m {{.RAM}}
-d ../{{.PROJECT}}/output \ -d ../{{.PROJECT}}
> openrefine.log 2>&1 & > openrefine.log 2>&1 &
- | # wait until OpenRefine API is available - | # wait until OpenRefine API is available
timeout 30s bash -c "until timeout 30s bash -c "until
wget -q -O - http://localhost:{{.PORT}} | cat | grep -q -o OpenRefine wget -q -O - http://localhost:{{.PORT}} | cat | grep -q -o OpenRefine
@@ -62,17 +62,18 @@ tasks:
done" done"
stop: stop:
dir: ./{{.PROJECT}}/output dir: ./{{.PROJECT}}
cmds: cmds:
- | # shut down OpenRefine - | # shut down OpenRefine
PID=$(lsof -t -i:{{.PORT}}) PID=$(lsof -t -i:{{.PORT}})
kill $PID kill $PID
while ps -p $PID > /dev/null; do sleep 1; done while ps -p $PID > /dev/null; do sleep 1; done
- | # archive the OpenRefine project - > # archive the OpenRefine project and delete temporary files
tar cfz \ tar cfz
{{.PROJECT}}.openrefine.tar.gz \ {{.PROJECT}}.openrefine.tar.gz
-C $(grep -l {{.PROJECT}} *.project/metadata.json | cut -d '/' -f 1) \ -C $(grep -l {{.PROJECT}} *.project/metadata.json | cut -d '/' -f 1)
. .
&& rm -rf ./*.project* workspace.json
check: check:
desc: check OpenRefine log for any warnings and exit on error desc: check OpenRefine log for any warnings and exit on error

View File

@@ -6,71 +6,56 @@ tasks:
cmds: cmds:
- task: refine - task: refine
- task: :check # check OpenRefine log for any warnings and exit on error - task: :check # check OpenRefine log for any warnings and exit on error
vars: {PROJECT: '{{splitList ":" .TASK | first}}'} vars: {PROJECT: '{{splitList ":" .TASK | first}}'}
refine: refine:
dir: ./{{.PROJECT}}
vars: vars:
PORT: 3335 # assign a different port for each project PORT: 3335 # assign a different port for each project
RAM: 2048M # maximum RAM for OpenRefine java heap space RAM: 2048M # maximum RAM for OpenRefine java heap space
PROJECT: '{{splitList ":" .TASK | first}}' PROJECT: '{{splitList ":" .TASK | first}}'
deps: # will be executed each run independent of up-to-date check deps: # will be executed each run independent of up-to-date check
- task: download - task: download
cmds: # tasks prepended with ":" are defined in Taskfile.yml cmds:
- task: :start - task: :start # launch OpenRefine
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}', RAM: '{{.RAM}}'} vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}', RAM: '{{.RAM}}'}
- task: import - > # import file
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'} ../openrefine/client -P {{.PORT}}
- task: apply --create "$(readlink -m input/doaj-article-sample.csv)"
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'} --projectName {{.PROJECT}}
- task: export - > # apply transformation rules
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'} ../openrefine/client -P {{.PORT}} {{.PROJECT}}
- task: stats --apply config/doaj-openrefine.json
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'} - > # export to file
- task: :stop mkdir -p output &&
../openrefine/client -P {{.PORT}} {{.PROJECT}}
--output "$(readlink -m output/doaj-results.tsv)"
- | # print allocated system resources
PID="$(lsof -t -i:{{.PORT}})"
echo "used $(($(ps --no-headers -o rss -p "$PID") / 1024)) MB RAM"
echo "used $(ps --no-headers -o cputime -p "$PID") CPU time"
- task: :stop # shut down OpenRefine and archive the OpenRefine project
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'} vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
sources: sources:
- input/** - input/**
- config/** - config/**
generates: generates:
- output/openrefine.log - openrefine.log
- output/{{.PROJECT}}.openrefine.tar.gz - ./{{.PROJECT}}.openrefine.tar.gz
ignore_error: true # workaround to avoid an orphaned Java process on error https://github.com/go-task/task/issues/141 - output/**
ignore_error: true # workaround to avoid an orphaned Java process on error
# https://github.com/go-task/task/issues/141
download: download:
dir: '{{splitList ":" .TASK | first}}'
cmds: cmds:
- mkdir -p input config - mkdir -p input config
- wget --no-verbose -O input/doaj-article-sample.csv https://github.com/felixlohmeier/openrefine-kimws2019/raw/master/doaj-article-sample.csv - > # Download input
- wget --no-verbose -O config/doaj-openrefine.json https://github.com/felixlohmeier/openrefine-kimws2019/raw/master/doaj-openrefine.json wget --no-verbose -O input/doaj-article-sample.csv
https://github.com/felixlohmeier/openrefine-kimws2019/raw/master/doaj-article-sample.csv
import: - > # Download config
dir: input wget --no-verbose -O config/doaj-openrefine.json
cmds: https://github.com/felixlohmeier/openrefine-kimws2019/raw/master/doaj-openrefine.json
- | # import file
../../openrefine/client -P {{.PORT}} \
--create doaj-article-sample.csv \
--projectName {{.PROJECT}}
ignore_error: true # workaround
apply:
dir: config
cmds:
- | # apply transformation rules
../../openrefine/client -P {{.PORT}} {{.PROJECT}} \
--apply doaj-openrefine.json
ignore_error: true # workaround
export:
dir: output
cmds:
- | # export to file; use readlink to log full path to output file
../../openrefine/client -P {{.PORT}} {{.PROJECT}} \
--output "$(readlink -m doaj-results.tsv)"
ignore_error: true # workaround
stats:
cmds:
- ps -o start,etime,%mem,%cpu,rss -p $(lsof -t -i:{{.PORT}}) # print allocated system resources
ignore_error: true # workaround
default: # enable standalone execution (running `task` in project directory) default: # enable standalone execution (running `task` in project directory)
cmds: cmds:

View File

@@ -6,64 +6,44 @@ tasks:
cmds: cmds:
- task: refine - task: refine
- task: :check # check OpenRefine log for any warnings and exit on error - task: :check # check OpenRefine log for any warnings and exit on error
vars: {PROJECT: '{{splitList ":" .TASK | first}}'} vars: {PROJECT: '{{splitList ":" .TASK | first}}'}
refine: refine:
dir: ./{{.PROJECT}}
vars: vars:
PORT: 3334 # assign a different port for each project PORT: 3334 # assign a different port for each project
RAM: 2048M # maximum RAM for OpenRefine java heap space RAM: 2048M # maximum RAM for OpenRefine java heap space
PROJECT: '{{splitList ":" .TASK | first}}' PROJECT: '{{splitList ":" .TASK | first}}'
cmds: # tasks prepended with ":" are defined in Taskfile.yml cmds:
- task: :start - task: :start # launch OpenRefine
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}', RAM: '{{.RAM}}'} vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}', RAM: '{{.RAM}}'}
- task: import - > # import file
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'} ../openrefine/client -P {{.PORT}}
- task: apply --create "$(readlink -m input/duplicates.csv)"
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'} --encoding UTF-8
- task: export --projectName {{.PROJECT}}
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'} - > # apply transformation rules
- task: stats ../openrefine/client -P {{.PORT}} {{.PROJECT}}
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'} --apply config/duplicates-deletion.json
- task: :stop - > # export to file
mkdir -p output &&
../openrefine/client -P {{.PORT}} {{.PROJECT}}
--output "$(readlink -m output/deduped.xls)"
- | # print allocated system resources
PID="$(lsof -t -i:{{.PORT}})"
echo "used $(($(ps --no-headers -o rss -p "$PID") / 1024)) MB RAM"
echo "used $(ps --no-headers -o cputime -p "$PID") CPU time"
- task: :stop # shut down OpenRefine and archive the OpenRefine project
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'} vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
sources: sources:
- input/** - input/**
- config/** - config/**
generates: generates:
- output/openrefine.log - openrefine.log
- output/{{.PROJECT}}.openrefine.tar.gz - ./{{.PROJECT}}.openrefine.tar.gz
ignore_error: true # workaround to avoid an orphaned Java process on error https://github.com/go-task/task/issues/141 - output/**
ignore_error: true # workaround to avoid an orphaned Java process on error
import: # https://github.com/go-task/task/issues/141
dir: input
cmds:
- | # import file
../../openrefine/client -P {{.PORT}} \
--create duplicates.csv \
--encoding UTF-8 \
--projectName {{.PROJECT}}
ignore_error: true # workaround
apply:
dir: config
cmds:
- | # apply transformation rules
../../openrefine/client -P {{.PORT}} {{.PROJECT}} \
--apply duplicates-deletion.json
ignore_error: true # workaround
export:
dir: output
cmds:
- | # export to file; use readlink to log full path to output file
../../openrefine/client -P {{.PORT}} {{.PROJECT}} \
--output "$(readlink -m deduped.xls)"
ignore_error: true # workaround
stats:
cmds:
- ps -o start,etime,%mem,%cpu,rss -p $(lsof -t -i:{{.PORT}}) # print allocated system resources
ignore_error: true # workaround
default: # enable standalone execution (running `task` in project directory) default: # enable standalone execution (running `task` in project directory)
cmds: cmds:

View File

@@ -6,73 +6,57 @@ tasks:
cmds: cmds:
- task: refine - task: refine
- task: :check # check OpenRefine log for any warnings and exit on error - task: :check # check OpenRefine log for any warnings and exit on error
vars: {PROJECT: '{{splitList ":" .TASK | first}}'} vars: {PROJECT: '{{splitList ":" .TASK | first}}'}
refine: refine:
dir: ./{{.PROJECT}}
vars: vars:
PORT: 3336 # assign a different port for each project PORT: 3336 # assign a different port for each project
RAM: 2048M # maximum RAM for OpenRefine java heap space RAM: 2048M # maximum RAM for OpenRefine java heap space
PROJECT: '{{splitList ":" .TASK | first}}' PROJECT: '{{splitList ":" .TASK | first}}'
deps: # will be executed each run independent of up-to-date check deps: # will be executed each run independent of up-to-date check
- task: download - task: download
cmds: # tasks prepended with ":" are defined in Taskfile.yml cmds:
- task: :start - task: :start # launch OpenRefine
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}', RAM: '{{.RAM}}'} vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}', RAM: '{{.RAM}}'}
- task: import - > # import file
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'} ../openrefine/client -P {{.PORT}}
- task: apply --create "$(readlink -m input/phm-collection.tsv)"
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'} --processQuotes false
- task: export --guessCellValueTypes true
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'} --projectName {{.PROJECT}}
- task: stats - > # apply transformation rules
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'} ../openrefine/client -P {{.PORT}} {{.PROJECT}}
- task: :stop --apply config/phm-transform.json
- > # export to file
mkdir -p output &&
../openrefine/client -P {{.PORT}} {{.PROJECT}}
--output "$(readlink -m output/phm-results.tsv)"
- | # print allocated system resources
PID="$(lsof -t -i:{{.PORT}})"
echo "used $(($(ps --no-headers -o rss -p "$PID") / 1024)) MB RAM"
echo "used $(ps --no-headers -o cputime -p "$PID") CPU time"
- task: :stop # shut down OpenRefine and archive the OpenRefine project
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'} vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
sources: sources:
- input/** - input/**
- config/** - config/**
generates: generates:
- output/openrefine.log - openrefine.log
- output/{{.PROJECT}}.openrefine.tar.gz - ./{{.PROJECT}}.openrefine.tar.gz
ignore_error: true # workaround to avoid an orphaned Java process on error https://github.com/go-task/task/issues/141 - output/**
ignore_error: true # workaround to avoid an orphaned Java process on error
# https://github.com/go-task/task/issues/141
download: download:
dir: '{{splitList ":" .TASK | first}}'
cmds: cmds:
- mkdir -p input config - mkdir -p input config
- wget --no-verbose -O input/phm-collection.tsv https://github.com/opencultureconsulting/openrefine-batch/raw/master/examples/powerhouse-museum/input/phm-collection.tsv - > # Download input
- wget --no-verbose -O config/phm-transform.json https://github.com/opencultureconsulting/openrefine-batch/raw/master/examples/powerhouse-museum/config/phm-transform.json wget --no-verbose -O input/phm-collection.tsv
https://github.com/opencultureconsulting/openrefine-batch/raw/master/examples/powerhouse-museum/input/phm-collection.tsv
import: - > # Download config
dir: input wget --no-verbose -O config/phm-transform.json
cmds: https://github.com/opencultureconsulting/openrefine-batch/raw/master/examples/powerhouse-museum/config/phm-transform.json
- | # import file
../../openrefine/client -P {{.PORT}} \
--create phm-collection.tsv \
--processQuotes false \
--guessCellValueTypes true \
--projectName {{.PROJECT}}
ignore_error: true # workaround
apply:
dir: config
cmds:
- | # apply transformation rules
../../openrefine/client -P {{.PORT}} {{.PROJECT}} \
--apply phm-transform.json
ignore_error: true # workaround
export:
dir: output
cmds:
- | # export to file; use readlink to log full path to output file
../../openrefine/client -P {{.PORT}} {{.PROJECT}} \
--output "$(readlink -m phm-results.tsv)"
ignore_error: true # workaround
stats:
cmds:
- ps -o start,etime,%mem,%cpu,rss -p $(lsof -t -i:{{.PORT}}) # print allocated system resources
ignore_error: true # workaround
default: # enable standalone execution (running `task` in project directory) default: # enable standalone execution (running `task` in project directory)
cmds: cmds: