🚚 add flexibility by separating dir and project vars
This commit is contained in:
parent
8f029531f5
commit
498a261dd7
|
@ -133,6 +133,7 @@ Please file an [issue](https://github.com/opencultureconsulting/openrefine-task-
|
||||||
- [ ] example for download "fresh" input data as a dependent task and generating archives/diffs
|
- [ ] example for download "fresh" input data as a dependent task and generating archives/diffs
|
||||||
- [ ] example for applying multiple json files
|
- [ ] example for applying multiple json files
|
||||||
- [ ] example for templating xml and validation with xmllint
|
- [ ] example for templating xml and validation with xmllint
|
||||||
|
- [ ] example for multiple projects in one directory/taskfile
|
||||||
- [ ] describe example datasets (and differences) with source code examples
|
- [ ] describe example datasets (and differences) with source code examples
|
||||||
- [ ] elaborate how-to for developing tasks
|
- [ ] elaborate how-to for developing tasks
|
||||||
- [ ] document openrefine-client options and defaults (tables for input and output with file-format-specific defaults) including templating
|
- [ ] document openrefine-client options and defaults (tables for input and output with file-format-specific defaults) including templating
|
||||||
|
|
28
Taskfile.yml
28
Taskfile.yml
|
@ -1,4 +1,4 @@
|
||||||
# https://github.com/opencultureconsulting/openrefine-tasks
|
# https://github.com/opencultureconsulting/openrefine-task-runner
|
||||||
|
|
||||||
version: '3'
|
version: '3'
|
||||||
|
|
||||||
|
@ -6,11 +6,17 @@ includes:
|
||||||
example-doaj: example-doaj
|
example-doaj: example-doaj
|
||||||
example-duplicates: example-duplicates
|
example-duplicates: example-duplicates
|
||||||
example-powerhouse: example-powerhouse
|
example-powerhouse: example-powerhouse
|
||||||
# add your project here
|
# add the directory name of your project here
|
||||||
|
|
||||||
silent: true
|
silent: true
|
||||||
output: prefixed
|
output: prefixed
|
||||||
|
|
||||||
|
env:
|
||||||
|
OPENREFINE:
|
||||||
|
sh: readlink -m openrefine/refine
|
||||||
|
CLIENT:
|
||||||
|
sh: readlink -m openrefine/client
|
||||||
|
|
||||||
tasks:
|
tasks:
|
||||||
default:
|
default:
|
||||||
desc: execute all projects in parallel
|
desc: execute all projects in parallel
|
||||||
|
@ -18,7 +24,7 @@ tasks:
|
||||||
- task: example-doaj:refine
|
- task: example-doaj:refine
|
||||||
- task: example-duplicates:refine
|
- task: example-duplicates:refine
|
||||||
- task: example-powerhouse:refine
|
- task: example-powerhouse:refine
|
||||||
# add your project here
|
# add the directory name of your project here
|
||||||
cmds:
|
cmds:
|
||||||
- task: check
|
- task: check
|
||||||
|
|
||||||
|
@ -44,16 +50,16 @@ tasks:
|
||||||
&& chmod +x openrefine/client
|
&& chmod +x openrefine/client
|
||||||
|
|
||||||
start:
|
start:
|
||||||
dir: ./{{.PROJECT}}
|
dir: ./{{.DIR}}
|
||||||
cmds:
|
cmds:
|
||||||
- | # check install and delete any temporary OpenRefine files
|
- | # check install and delete any temporary OpenRefine files
|
||||||
if [ ! -f "../openrefine/refine" ]; then
|
if [ ! -f "$OPENREFINE" ]; then
|
||||||
echo 1>&2 "OpenRefine missing; try task install"; exit 1
|
echo 1>&2 "OpenRefine missing; try task install"; exit 1
|
||||||
fi
|
fi
|
||||||
rm -rf ./*.project* workspace.json openrefine.log
|
rm -rf ./*.project* workspace.json openrefine.log
|
||||||
- > # launch OpenRefine with specific data directory and redirect its output to a log file
|
- > # launch OpenRefine with specific data directory and redirect its output to a log file
|
||||||
../openrefine/refine -v warn -p {{.PORT}} -m {{.RAM}}
|
"$OPENREFINE" -v warn -p {{.PORT}} -m {{.RAM}}
|
||||||
-d ../{{.PROJECT}}
|
-d ../{{.DIR}}
|
||||||
>> openrefine.log 2>&1 &
|
>> openrefine.log 2>&1 &
|
||||||
- | # wait until OpenRefine API is available
|
- | # wait until OpenRefine API is available
|
||||||
timeout 30s bash -c "until
|
timeout 30s bash -c "until
|
||||||
|
@ -62,7 +68,7 @@ tasks:
|
||||||
done"
|
done"
|
||||||
|
|
||||||
stop:
|
stop:
|
||||||
dir: ./{{.PROJECT}}
|
dir: ./{{.DIR}}
|
||||||
cmds:
|
cmds:
|
||||||
- | # shut down OpenRefine
|
- | # shut down OpenRefine
|
||||||
PID=$(lsof -t -i:{{.PORT}})
|
PID=$(lsof -t -i:{{.PORT}})
|
||||||
|
@ -70,14 +76,14 @@ tasks:
|
||||||
while ps -p $PID > /dev/null; do sleep 1; done
|
while ps -p $PID > /dev/null; do sleep 1; done
|
||||||
- > # archive the OpenRefine project and delete temporary files
|
- > # archive the OpenRefine project and delete temporary files
|
||||||
tar cfz
|
tar cfz
|
||||||
{{.PROJECT}}.openrefine.tar.gz
|
"{{.PROJECT}}.openrefine.tar.gz"
|
||||||
-C $(grep -l {{.PROJECT}} *.project/metadata.json | cut -d '/' -f 1)
|
-C $(grep -l "{{.PROJECT}}" *.project/metadata.json | cut -d '/' -f 1)
|
||||||
.
|
.
|
||||||
&& rm -rf ./*.project* workspace.json
|
&& rm -rf ./*.project* workspace.json
|
||||||
|
|
||||||
check:
|
check:
|
||||||
desc: check OpenRefine log for any warnings and exit on error
|
desc: check OpenRefine log for any warnings and exit on error
|
||||||
dir: ./{{.PROJECT}}
|
dir: ./{{.DIR}}
|
||||||
cmds:
|
cmds:
|
||||||
- | # find log file(s) and check for "exception" or "error"
|
- | # find log file(s) and check for "exception" or "error"
|
||||||
if grep -i 'exception\|error' $(find . -name openrefine.log); then
|
if grep -i 'exception\|error' $(find . -name openrefine.log); then
|
||||||
|
|
|
@ -3,34 +3,36 @@ version: '3'
|
||||||
tasks:
|
tasks:
|
||||||
main:
|
main:
|
||||||
desc: Library Carpentry Lesson covering DOAJ
|
desc: Library Carpentry Lesson covering DOAJ
|
||||||
|
vars: {DIR: '{{splitList ":" .TASK | first}}'}
|
||||||
cmds:
|
cmds:
|
||||||
- task: refine
|
- task: refine
|
||||||
- task: :check # check OpenRefine log for any warnings and exit on error
|
- task: :check # check OpenRefine log for any warnings and exit on error
|
||||||
vars: {PROJECT: '{{splitList ":" .TASK | first}}'}
|
vars: {DIR: '{{.DIR}}'}
|
||||||
|
|
||||||
refine:
|
refine:
|
||||||
dir: ./{{.PROJECT}}
|
dir: ./{{.DIR}}
|
||||||
vars:
|
vars:
|
||||||
PORT: 3335 # assign a different port for each project
|
PORT: 3334 # assign a different port for each project
|
||||||
RAM: 2048M # maximum RAM for OpenRefine java heap space
|
RAM: 2048M # maximum RAM for OpenRefine java heap space
|
||||||
|
DIR: '{{splitList ":" .TASK | first}}'
|
||||||
PROJECT: '{{splitList ":" .TASK | first}}'
|
PROJECT: '{{splitList ":" .TASK | first}}'
|
||||||
deps: # will be executed each run independent of up-to-date check
|
deps: # will be executed each run independent of up-to-date check
|
||||||
- task: download
|
- task: download
|
||||||
cmds:
|
cmds:
|
||||||
- task: :start # launch OpenRefine
|
- task: :start # launch OpenRefine
|
||||||
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}', RAM: '{{.RAM}}'}
|
vars: {DIR: '{{.DIR}}', PORT: '{{.PORT}}', RAM: '{{.RAM}}'}
|
||||||
- > # import file
|
- > # import file
|
||||||
../openrefine/client -P {{.PORT}}
|
"$CLIENT" -P {{.PORT}}
|
||||||
--create "$(readlink -m input/doaj-article-sample.csv)"
|
--create "$(readlink -m input/doaj-article-sample.csv)"
|
||||||
--projectName {{.PROJECT}}
|
--projectName {{.PROJECT}}
|
||||||
> >(tee -a openrefine.log) 2>&1
|
> >(tee -a openrefine.log) 2>&1
|
||||||
- > # apply transformation rules
|
- > # apply transformation rules
|
||||||
../openrefine/client -P {{.PORT}} {{.PROJECT}}
|
"$CLIENT" -P {{.PORT}} {{.PROJECT}}
|
||||||
--apply config/doaj-openrefine.json
|
--apply config/doaj-openrefine.json
|
||||||
> >(tee -a openrefine.log) 2>&1
|
> >(tee -a openrefine.log) 2>&1
|
||||||
- > # export to file
|
- > # export to file
|
||||||
mkdir -p output &&
|
mkdir -p output &&
|
||||||
../openrefine/client -P {{.PORT}} {{.PROJECT}}
|
"$CLIENT" -P {{.PORT}} {{.PROJECT}}
|
||||||
--output "$(readlink -m output/doaj-results.tsv)"
|
--output "$(readlink -m output/doaj-results.tsv)"
|
||||||
> >(tee -a openrefine.log) 2>&1
|
> >(tee -a openrefine.log) 2>&1
|
||||||
- | # print allocated system resources
|
- | # print allocated system resources
|
||||||
|
@ -40,7 +42,7 @@ tasks:
|
||||||
echo "used $(ps --no-headers -o cputime -p "$PID") CPU time" \
|
echo "used $(ps --no-headers -o cputime -p "$PID") CPU time" \
|
||||||
> >(tee -a openrefine.log)
|
> >(tee -a openrefine.log)
|
||||||
- task: :stop # shut down OpenRefine and archive the OpenRefine project
|
- task: :stop # shut down OpenRefine and archive the OpenRefine project
|
||||||
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
|
vars: {DIR: '{{.DIR}}', PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
|
||||||
sources:
|
sources:
|
||||||
- input/**
|
- input/**
|
||||||
- config/**
|
- config/**
|
||||||
|
@ -52,7 +54,8 @@ tasks:
|
||||||
# https://github.com/go-task/task/issues/141
|
# https://github.com/go-task/task/issues/141
|
||||||
|
|
||||||
download:
|
download:
|
||||||
dir: '{{splitList ":" .TASK | first}}'
|
dir: ./{{.DIR}}
|
||||||
|
vars: {DIR: '{{splitList ":" .TASK | first}}'}
|
||||||
cmds:
|
cmds:
|
||||||
- mkdir -p input config
|
- mkdir -p input config
|
||||||
- > # Download input
|
- > # Download input
|
||||||
|
@ -64,4 +67,4 @@ tasks:
|
||||||
|
|
||||||
default: # enable standalone execution (running `task` in project directory)
|
default: # enable standalone execution (running `task` in project directory)
|
||||||
cmds:
|
cmds:
|
||||||
- PROJECT="${PWD##*/}:main" && cd .. && task "$PROJECT"
|
- DIR="${PWD##*/}:main" && cd .. && task "$DIR"
|
||||||
|
|
|
@ -3,33 +3,35 @@ version: '3'
|
||||||
tasks:
|
tasks:
|
||||||
main:
|
main:
|
||||||
desc: Removing duplicates in a very small test dataset
|
desc: Removing duplicates in a very small test dataset
|
||||||
|
vars: {DIR: '{{splitList ":" .TASK | first}}'}
|
||||||
cmds:
|
cmds:
|
||||||
- task: refine
|
- task: refine
|
||||||
- task: :check # check OpenRefine log for any warnings and exit on error
|
- task: :check # check OpenRefine log for any warnings and exit on error
|
||||||
vars: {PROJECT: '{{splitList ":" .TASK | first}}'}
|
vars: {DIR: '{{.DIR}}'}
|
||||||
|
|
||||||
refine:
|
refine:
|
||||||
dir: ./{{.PROJECT}}
|
dir: ./{{.DIR}}
|
||||||
vars:
|
vars:
|
||||||
PORT: 3334 # assign a different port for each project
|
PORT: 3335 # assign a different port for each project
|
||||||
RAM: 2048M # maximum RAM for OpenRefine java heap space
|
RAM: 2048M # maximum RAM for OpenRefine java heap space
|
||||||
|
DIR: '{{splitList ":" .TASK | first}}'
|
||||||
PROJECT: '{{splitList ":" .TASK | first}}'
|
PROJECT: '{{splitList ":" .TASK | first}}'
|
||||||
cmds:
|
cmds:
|
||||||
- task: :start # launch OpenRefine
|
- task: :start # launch OpenRefine
|
||||||
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}', RAM: '{{.RAM}}'}
|
vars: {DIR: '{{.DIR}}', PORT: '{{.PORT}}', RAM: '{{.RAM}}'}
|
||||||
- > # import file
|
- > # import file
|
||||||
../openrefine/client -P {{.PORT}}
|
"$CLIENT" -P {{.PORT}}
|
||||||
--create "$(readlink -m input/duplicates.csv)"
|
--create "$(readlink -m input/duplicates.csv)"
|
||||||
--encoding UTF-8
|
--encoding UTF-8
|
||||||
--projectName {{.PROJECT}}
|
--projectName {{.PROJECT}}
|
||||||
> >(tee -a openrefine.log) 2>&1
|
> >(tee -a openrefine.log) 2>&1
|
||||||
- > # apply transformation rules
|
- > # apply transformation rules
|
||||||
../openrefine/client -P {{.PORT}} {{.PROJECT}}
|
"$CLIENT" -P {{.PORT}} {{.PROJECT}}
|
||||||
--apply config/duplicates-deletion.json
|
--apply config/duplicates-deletion.json
|
||||||
> >(tee -a openrefine.log) 2>&1
|
> >(tee -a openrefine.log) 2>&1
|
||||||
- > # export to file
|
- > # export to file
|
||||||
mkdir -p output &&
|
mkdir -p output &&
|
||||||
../openrefine/client -P {{.PORT}} {{.PROJECT}}
|
"$CLIENT" -P {{.PORT}} {{.PROJECT}}
|
||||||
--output "$(readlink -m output/deduped.xls)"
|
--output "$(readlink -m output/deduped.xls)"
|
||||||
> >(tee -a openrefine.log) 2>&1
|
> >(tee -a openrefine.log) 2>&1
|
||||||
- | # print allocated system resources
|
- | # print allocated system resources
|
||||||
|
@ -39,7 +41,7 @@ tasks:
|
||||||
echo "used $(ps --no-headers -o cputime -p "$PID") CPU time" \
|
echo "used $(ps --no-headers -o cputime -p "$PID") CPU time" \
|
||||||
> >(tee -a openrefine.log)
|
> >(tee -a openrefine.log)
|
||||||
- task: :stop # shut down OpenRefine and archive the OpenRefine project
|
- task: :stop # shut down OpenRefine and archive the OpenRefine project
|
||||||
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
|
vars: {DIR: '{{.DIR}}', PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
|
||||||
sources:
|
sources:
|
||||||
- input/**
|
- input/**
|
||||||
- config/**
|
- config/**
|
||||||
|
@ -52,4 +54,4 @@ tasks:
|
||||||
|
|
||||||
default: # enable standalone execution (running `task` in project directory)
|
default: # enable standalone execution (running `task` in project directory)
|
||||||
cmds:
|
cmds:
|
||||||
- PROJECT="${PWD##*/}:main" && cd .. && task "$PROJECT"
|
- DIR="${PWD##*/}:main" && cd .. && task "$DIR"
|
||||||
|
|
|
@ -3,36 +3,38 @@ version: '3'
|
||||||
tasks:
|
tasks:
|
||||||
main:
|
main:
|
||||||
desc: Powerhouse Museum Tutorial
|
desc: Powerhouse Museum Tutorial
|
||||||
|
vars: {DIR: '{{splitList ":" .TASK | first}}'}
|
||||||
cmds:
|
cmds:
|
||||||
- task: refine
|
- task: refine
|
||||||
- task: :check # check OpenRefine log for any warnings and exit on error
|
- task: :check # check OpenRefine log for any warnings and exit on error
|
||||||
vars: {PROJECT: '{{splitList ":" .TASK | first}}'}
|
vars: {DIR: '{{.DIR}}'}
|
||||||
|
|
||||||
refine:
|
refine:
|
||||||
dir: ./{{.PROJECT}}
|
dir: ./{{.DIR}}
|
||||||
vars:
|
vars:
|
||||||
PORT: 3336 # assign a different port for each project
|
PORT: 3336 # assign a different port for each project
|
||||||
RAM: 2048M # maximum RAM for OpenRefine java heap space
|
RAM: 2048M # maximum RAM for OpenRefine java heap space
|
||||||
|
DIR: '{{splitList ":" .TASK | first}}'
|
||||||
PROJECT: '{{splitList ":" .TASK | first}}'
|
PROJECT: '{{splitList ":" .TASK | first}}'
|
||||||
deps: # will be executed each run independent of up-to-date check
|
deps: # will be executed each run independent of up-to-date check
|
||||||
- task: download
|
- task: download
|
||||||
cmds:
|
cmds:
|
||||||
- task: :start # launch OpenRefine
|
- task: :start # launch OpenRefine
|
||||||
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}', RAM: '{{.RAM}}'}
|
vars: {DIR: '{{.DIR}}', PORT: '{{.PORT}}', RAM: '{{.RAM}}'}
|
||||||
- > # import file
|
- > # import file
|
||||||
../openrefine/client -P {{.PORT}}
|
"$CLIENT" -P {{.PORT}}
|
||||||
--create "$(readlink -m input/phm-collection.tsv)"
|
--create "$(readlink -m input/phm-collection.tsv)"
|
||||||
--processQuotes false
|
--processQuotes false
|
||||||
--guessCellValueTypes true
|
--guessCellValueTypes true
|
||||||
--projectName {{.PROJECT}}
|
--projectName {{.PROJECT}}
|
||||||
> >(tee -a openrefine.log) 2>&1
|
> >(tee -a openrefine.log) 2>&1
|
||||||
- > # apply transformation rules
|
- > # apply transformation rules
|
||||||
../openrefine/client -P {{.PORT}} {{.PROJECT}}
|
"$CLIENT" -P {{.PORT}} {{.PROJECT}}
|
||||||
--apply config/phm-transform.json
|
--apply config/phm-transform.json
|
||||||
> >(tee -a openrefine.log) 2>&1
|
> >(tee -a openrefine.log) 2>&1
|
||||||
- > # export to file
|
- > # export to file
|
||||||
mkdir -p output &&
|
mkdir -p output &&
|
||||||
../openrefine/client -P {{.PORT}} {{.PROJECT}}
|
"$CLIENT" -P {{.PORT}} {{.PROJECT}}
|
||||||
--output "$(readlink -m output/phm-results.tsv)"
|
--output "$(readlink -m output/phm-results.tsv)"
|
||||||
> >(tee -a openrefine.log) 2>&1
|
> >(tee -a openrefine.log) 2>&1
|
||||||
- | # print allocated system resources
|
- | # print allocated system resources
|
||||||
|
@ -42,7 +44,7 @@ tasks:
|
||||||
echo "used $(ps --no-headers -o cputime -p "$PID") CPU time" \
|
echo "used $(ps --no-headers -o cputime -p "$PID") CPU time" \
|
||||||
> >(tee -a openrefine.log)
|
> >(tee -a openrefine.log)
|
||||||
- task: :stop # shut down OpenRefine and archive the OpenRefine project
|
- task: :stop # shut down OpenRefine and archive the OpenRefine project
|
||||||
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
|
vars: {DIR: '{{.DIR}}', PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
|
||||||
sources:
|
sources:
|
||||||
- input/**
|
- input/**
|
||||||
- config/**
|
- config/**
|
||||||
|
@ -53,7 +55,8 @@ tasks:
|
||||||
ignore_error: true # workaround to avoid an orphaned Java process on error
|
ignore_error: true # workaround to avoid an orphaned Java process on error
|
||||||
# https://github.com/go-task/task/issues/141
|
# https://github.com/go-task/task/issues/141
|
||||||
download:
|
download:
|
||||||
dir: '{{splitList ":" .TASK | first}}'
|
dir: ./{{.DIR}}
|
||||||
|
vars: {DIR: '{{splitList ":" .TASK | first}}'}
|
||||||
cmds:
|
cmds:
|
||||||
- mkdir -p input config
|
- mkdir -p input config
|
||||||
- > # Download input
|
- > # Download input
|
||||||
|
@ -65,4 +68,4 @@ tasks:
|
||||||
|
|
||||||
default: # enable standalone execution (running `task` in project directory)
|
default: # enable standalone execution (running `task` in project directory)
|
||||||
cmds:
|
cmds:
|
||||||
- PROJECT="${PWD##*/}:main" && cd .. && task "$PROJECT"
|
- DIR="${PWD##*/}:main" && cd .. && task "$DIR"
|
||||||
|
|
Loading…
Reference in New Issue