🎨 restructure taskfiles
This commit is contained in:
parent
62eb0cddbf
commit
6789554c60
|
@ -1,6 +1,8 @@
|
||||||
.task
|
.task
|
||||||
openrefine
|
openrefine
|
||||||
*/output
|
*/output
|
||||||
|
*/openrefine.log
|
||||||
|
*/*.openrefine.tar.gz
|
||||||
example-doaj/input
|
example-doaj/input
|
||||||
example-doaj/config
|
example-doaj/config
|
||||||
example-powerhouse/input
|
example-powerhouse/input
|
||||||
|
|
|
@ -139,4 +139,4 @@ Please file an [issue](https://github.com/opencultureconsulting/openrefine-task-
|
||||||
- [ ] how-to for extracting input options from OpenRefine GUI (via metadata in open project)
|
- [ ] how-to for extracting input options from OpenRefine GUI (via metadata in open project)
|
||||||
- [ ] document known issues, e.g. [import xls, xlsx, ods](https://github.com/opencultureconsulting/openrefine-client/issues/4)
|
- [ ] document known issues, e.g. [import xls, xlsx, ods](https://github.com/opencultureconsulting/openrefine-client/issues/4)
|
||||||
- [ ] add Binder files and badge
|
- [ ] add Binder files and badge
|
||||||
- [ ] add example notebooks (links to nbviewer and Binder)
|
- [ ] add example notebooks (links to nbviewer and Binder)
|
||||||
|
|
63
Taskfile.yml
63
Taskfile.yml
|
@ -3,15 +3,9 @@
|
||||||
version: '3'
|
version: '3'
|
||||||
|
|
||||||
includes:
|
includes:
|
||||||
example-doaj:
|
example-doaj: example-doaj
|
||||||
taskfile: example-doaj
|
example-duplicates: example-duplicates
|
||||||
dir: example-doaj
|
example-powerhouse: example-powerhouse
|
||||||
example-duplicates:
|
|
||||||
taskfile: example-duplicates
|
|
||||||
dir: example-duplicates
|
|
||||||
example-powerhouse:
|
|
||||||
taskfile: example-powerhouse
|
|
||||||
dir: example-powerhouse
|
|
||||||
# add your project here
|
# add your project here
|
||||||
|
|
||||||
silent: true
|
silent: true
|
||||||
|
@ -32,29 +26,35 @@ tasks:
|
||||||
desc: (re)install OpenRefine and openrefine-client into subdirectory openrefine
|
desc: (re)install OpenRefine and openrefine-client into subdirectory openrefine
|
||||||
cmds:
|
cmds:
|
||||||
- | # delete existing install and recreate folder
|
- | # delete existing install and recreate folder
|
||||||
rm -rf openrefine; mkdir -p openrefine
|
rm -rf openrefine
|
||||||
- | # install OpenRefine into subdirectory openrefine
|
mkdir -p openrefine
|
||||||
wget --no-verbose -O openrefine.tar.gz https://github.com/OpenRefine/OpenRefine/releases/download/3.4.1/openrefine-linux-3.4.1.tar.gz
|
- > # download OpenRefine archive
|
||||||
tar -xzf openrefine.tar.gz -C openrefine --strip 1 && rm openrefine.tar.gz
|
wget --no-verbose -O openrefine.tar.gz
|
||||||
- sed -i 's/cd `dirname $0`/cd "$(dirname "$0")"/' "openrefine/refine" # fix path issue in OpenRefine startup file
|
https://github.com/OpenRefine/OpenRefine/releases/download/3.4.1/openrefine-linux-3.4.1.tar.gz
|
||||||
- sed -i '$ a JAVA_OPTIONS=-Drefine.headless=true' "openrefine/refine.ini" # do not try to open OpenRefine in browser
|
- > # install OpenRefine into subdirectory openrefine
|
||||||
- sed -i 's/#REFINE_AUTOSAVE_PERIOD=60/REFINE_AUTOSAVE_PERIOD=1440/' "openrefine/refine.ini" # set autosave period from 5 minutes to 25 hours
|
tar -xzf openrefine.tar.gz -C openrefine --strip 1
|
||||||
- | # install openrefine-client into subdirectory openrefine
|
&& rm openrefine.tar.gz
|
||||||
wget --no-verbose -O openrefine/client https://github.com/opencultureconsulting/openrefine-client/releases/download/v0.3.10/openrefine-client_0-3-10_linux
|
- | # optimize OpenRefine for batch processing
|
||||||
chmod +x openrefine/client
|
sed -i 's/cd `dirname $0`/cd "$(dirname "$0")"/' "openrefine/refine" # fix path issue in OpenRefine startup file
|
||||||
|
sed -i '$ a JAVA_OPTIONS=-Drefine.headless=true' "openrefine/refine.ini" # do not try to open OpenRefine in browser
|
||||||
|
sed -i 's/#REFINE_AUTOSAVE_PERIOD=60/REFINE_AUTOSAVE_PERIOD=1440/' "openrefine/refine.ini" # set autosave period from 5 minutes to 25 hours
|
||||||
|
- > # download openrefine-client into subdirectory openrefine
|
||||||
|
wget --no-verbose -O openrefine/client
|
||||||
|
https://github.com/opencultureconsulting/openrefine-client/releases/download/v0.3.10/openrefine-client_0-3-10_linux
|
||||||
|
&& chmod +x openrefine/client
|
||||||
|
|
||||||
start:
|
start:
|
||||||
dir: ./{{.PROJECT}}/output
|
dir: ./{{.PROJECT}}
|
||||||
cmds:
|
cmds:
|
||||||
- | # check install and delete any temporary OpenRefine files
|
- | # check install and delete any temporary OpenRefine files
|
||||||
if [ ! -f "../../openrefine/refine" ]; then
|
if [ ! -f "../openrefine/refine" ]; then
|
||||||
echo 1>&2 "OpenRefine missing; try task install"; exit 1
|
echo 1>&2 "OpenRefine missing; try task install"; exit 1
|
||||||
fi
|
fi
|
||||||
rm -rf ./*.project* workspace.json
|
rm -rf ./*.project* workspace.json
|
||||||
- | # launch OpenRefine with specific data directory and redirect its output to a log file
|
- > # launch OpenRefine with specific data directory and redirect its output to a log file
|
||||||
../../openrefine/refine -v warn -p {{.PORT}} -m {{.RAM}} \
|
../openrefine/refine -v warn -p {{.PORT}} -m {{.RAM}}
|
||||||
-d ../{{.PROJECT}}/output \
|
-d ../{{.PROJECT}}
|
||||||
> openrefine.log 2>&1 &
|
> openrefine.log 2>&1 &
|
||||||
- | # wait until OpenRefine API is available
|
- | # wait until OpenRefine API is available
|
||||||
timeout 30s bash -c "until
|
timeout 30s bash -c "until
|
||||||
wget -q -O - http://localhost:{{.PORT}} | cat | grep -q -o OpenRefine
|
wget -q -O - http://localhost:{{.PORT}} | cat | grep -q -o OpenRefine
|
||||||
|
@ -62,17 +62,18 @@ tasks:
|
||||||
done"
|
done"
|
||||||
|
|
||||||
stop:
|
stop:
|
||||||
dir: ./{{.PROJECT}}/output
|
dir: ./{{.PROJECT}}
|
||||||
cmds:
|
cmds:
|
||||||
- | # shut down OpenRefine
|
- | # shut down OpenRefine
|
||||||
PID=$(lsof -t -i:{{.PORT}})
|
PID=$(lsof -t -i:{{.PORT}})
|
||||||
kill $PID
|
kill $PID
|
||||||
while ps -p $PID > /dev/null; do sleep 1; done
|
while ps -p $PID > /dev/null; do sleep 1; done
|
||||||
- | # archive the OpenRefine project
|
- > # archive the OpenRefine project and delete temporary files
|
||||||
tar cfz \
|
tar cfz
|
||||||
{{.PROJECT}}.openrefine.tar.gz \
|
{{.PROJECT}}.openrefine.tar.gz
|
||||||
-C $(grep -l {{.PROJECT}} *.project/metadata.json | cut -d '/' -f 1) \
|
-C $(grep -l {{.PROJECT}} *.project/metadata.json | cut -d '/' -f 1)
|
||||||
.
|
.
|
||||||
|
&& rm -rf ./*.project* workspace.json
|
||||||
|
|
||||||
check:
|
check:
|
||||||
desc: check OpenRefine log for any warnings and exit on error
|
desc: check OpenRefine log for any warnings and exit on error
|
||||||
|
|
|
@ -6,71 +6,56 @@ tasks:
|
||||||
cmds:
|
cmds:
|
||||||
- task: refine
|
- task: refine
|
||||||
- task: :check # check OpenRefine log for any warnings and exit on error
|
- task: :check # check OpenRefine log for any warnings and exit on error
|
||||||
vars: {PROJECT: '{{splitList ":" .TASK | first}}'}
|
vars: {PROJECT: '{{splitList ":" .TASK | first}}'}
|
||||||
|
|
||||||
refine:
|
refine:
|
||||||
|
dir: ./{{.PROJECT}}
|
||||||
vars:
|
vars:
|
||||||
PORT: 3335 # assign a different port for each project
|
PORT: 3335 # assign a different port for each project
|
||||||
RAM: 2048M # maximum RAM for OpenRefine java heap space
|
RAM: 2048M # maximum RAM for OpenRefine java heap space
|
||||||
PROJECT: '{{splitList ":" .TASK | first}}'
|
PROJECT: '{{splitList ":" .TASK | first}}'
|
||||||
deps: # will be executed each run independent of up-to-date check
|
deps: # will be executed each run independent of up-to-date check
|
||||||
- task: download
|
- task: download
|
||||||
cmds: # tasks prepended with ":" are defined in Taskfile.yml
|
cmds:
|
||||||
- task: :start
|
- task: :start # launch OpenRefine
|
||||||
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}', RAM: '{{.RAM}}'}
|
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}', RAM: '{{.RAM}}'}
|
||||||
- task: import
|
- > # import file
|
||||||
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
|
../openrefine/client -P {{.PORT}}
|
||||||
- task: apply
|
--create "$(readlink -m input/doaj-article-sample.csv)"
|
||||||
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
|
--projectName {{.PROJECT}}
|
||||||
- task: export
|
- > # apply transformation rules
|
||||||
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
|
../openrefine/client -P {{.PORT}} {{.PROJECT}}
|
||||||
- task: stats
|
--apply config/doaj-openrefine.json
|
||||||
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
|
- > # export to file
|
||||||
- task: :stop
|
mkdir -p output &&
|
||||||
|
../openrefine/client -P {{.PORT}} {{.PROJECT}}
|
||||||
|
--output "$(readlink -m output/doaj-results.tsv)"
|
||||||
|
- | # print allocated system resources
|
||||||
|
PID="$(lsof -t -i:{{.PORT}})"
|
||||||
|
echo "used $(($(ps --no-headers -o rss -p "$PID") / 1024)) MB RAM"
|
||||||
|
echo "used $(ps --no-headers -o cputime -p "$PID") CPU time"
|
||||||
|
- task: :stop # shut down OpenRefine and archive the OpenRefine project
|
||||||
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
|
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
|
||||||
sources:
|
sources:
|
||||||
- input/**
|
- input/**
|
||||||
- config/**
|
- config/**
|
||||||
generates:
|
generates:
|
||||||
- output/openrefine.log
|
- openrefine.log
|
||||||
- output/{{.PROJECT}}.openrefine.tar.gz
|
- ./{{.PROJECT}}.openrefine.tar.gz
|
||||||
ignore_error: true # workaround to avoid an orphaned Java process on error https://github.com/go-task/task/issues/141
|
- output/**
|
||||||
|
ignore_error: true # workaround to avoid an orphaned Java process on error
|
||||||
|
# https://github.com/go-task/task/issues/141
|
||||||
|
|
||||||
download:
|
download:
|
||||||
|
dir: '{{splitList ":" .TASK | first}}'
|
||||||
cmds:
|
cmds:
|
||||||
- mkdir -p input config
|
- mkdir -p input config
|
||||||
- wget --no-verbose -O input/doaj-article-sample.csv https://github.com/felixlohmeier/openrefine-kimws2019/raw/master/doaj-article-sample.csv
|
- > # Download input
|
||||||
- wget --no-verbose -O config/doaj-openrefine.json https://github.com/felixlohmeier/openrefine-kimws2019/raw/master/doaj-openrefine.json
|
wget --no-verbose -O input/doaj-article-sample.csv
|
||||||
|
https://github.com/felixlohmeier/openrefine-kimws2019/raw/master/doaj-article-sample.csv
|
||||||
import:
|
- > # Download config
|
||||||
dir: input
|
wget --no-verbose -O config/doaj-openrefine.json
|
||||||
cmds:
|
https://github.com/felixlohmeier/openrefine-kimws2019/raw/master/doaj-openrefine.json
|
||||||
- | # import file
|
|
||||||
../../openrefine/client -P {{.PORT}} \
|
|
||||||
--create doaj-article-sample.csv \
|
|
||||||
--projectName {{.PROJECT}}
|
|
||||||
ignore_error: true # workaround
|
|
||||||
|
|
||||||
apply:
|
|
||||||
dir: config
|
|
||||||
cmds:
|
|
||||||
- | # apply transformation rules
|
|
||||||
../../openrefine/client -P {{.PORT}} {{.PROJECT}} \
|
|
||||||
--apply doaj-openrefine.json
|
|
||||||
ignore_error: true # workaround
|
|
||||||
|
|
||||||
export:
|
|
||||||
dir: output
|
|
||||||
cmds:
|
|
||||||
- | # export to file; use readlink to log full path to output file
|
|
||||||
../../openrefine/client -P {{.PORT}} {{.PROJECT}} \
|
|
||||||
--output "$(readlink -m doaj-results.tsv)"
|
|
||||||
ignore_error: true # workaround
|
|
||||||
|
|
||||||
stats:
|
|
||||||
cmds:
|
|
||||||
- ps -o start,etime,%mem,%cpu,rss -p $(lsof -t -i:{{.PORT}}) # print allocated system resources
|
|
||||||
ignore_error: true # workaround
|
|
||||||
|
|
||||||
default: # enable standalone execution (running `task` in project directory)
|
default: # enable standalone execution (running `task` in project directory)
|
||||||
cmds:
|
cmds:
|
||||||
|
|
|
@ -6,64 +6,44 @@ tasks:
|
||||||
cmds:
|
cmds:
|
||||||
- task: refine
|
- task: refine
|
||||||
- task: :check # check OpenRefine log for any warnings and exit on error
|
- task: :check # check OpenRefine log for any warnings and exit on error
|
||||||
vars: {PROJECT: '{{splitList ":" .TASK | first}}'}
|
vars: {PROJECT: '{{splitList ":" .TASK | first}}'}
|
||||||
|
|
||||||
refine:
|
refine:
|
||||||
|
dir: ./{{.PROJECT}}
|
||||||
vars:
|
vars:
|
||||||
PORT: 3334 # assign a different port for each project
|
PORT: 3334 # assign a different port for each project
|
||||||
RAM: 2048M # maximum RAM for OpenRefine java heap space
|
RAM: 2048M # maximum RAM for OpenRefine java heap space
|
||||||
PROJECT: '{{splitList ":" .TASK | first}}'
|
PROJECT: '{{splitList ":" .TASK | first}}'
|
||||||
cmds: # tasks prepended with ":" are defined in Taskfile.yml
|
cmds:
|
||||||
- task: :start
|
- task: :start # launch OpenRefine
|
||||||
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}', RAM: '{{.RAM}}'}
|
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}', RAM: '{{.RAM}}'}
|
||||||
- task: import
|
- > # import file
|
||||||
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
|
../openrefine/client -P {{.PORT}}
|
||||||
- task: apply
|
--create "$(readlink -m input/duplicates.csv)"
|
||||||
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
|
--encoding UTF-8
|
||||||
- task: export
|
--projectName {{.PROJECT}}
|
||||||
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
|
- > # apply transformation rules
|
||||||
- task: stats
|
../openrefine/client -P {{.PORT}} {{.PROJECT}}
|
||||||
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
|
--apply config/duplicates-deletion.json
|
||||||
- task: :stop
|
- > # export to file
|
||||||
|
mkdir -p output &&
|
||||||
|
../openrefine/client -P {{.PORT}} {{.PROJECT}}
|
||||||
|
--output "$(readlink -m output/deduped.xls)"
|
||||||
|
- | # print allocated system resources
|
||||||
|
PID="$(lsof -t -i:{{.PORT}})"
|
||||||
|
echo "used $(($(ps --no-headers -o rss -p "$PID") / 1024)) MB RAM"
|
||||||
|
echo "used $(ps --no-headers -o cputime -p "$PID") CPU time"
|
||||||
|
- task: :stop # shut down OpenRefine and archive the OpenRefine project
|
||||||
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
|
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
|
||||||
sources:
|
sources:
|
||||||
- input/**
|
- input/**
|
||||||
- config/**
|
- config/**
|
||||||
generates:
|
generates:
|
||||||
- output/openrefine.log
|
- openrefine.log
|
||||||
- output/{{.PROJECT}}.openrefine.tar.gz
|
- ./{{.PROJECT}}.openrefine.tar.gz
|
||||||
ignore_error: true # workaround to avoid an orphaned Java process on error https://github.com/go-task/task/issues/141
|
- output/**
|
||||||
|
ignore_error: true # workaround to avoid an orphaned Java process on error
|
||||||
import:
|
# https://github.com/go-task/task/issues/141
|
||||||
dir: input
|
|
||||||
cmds:
|
|
||||||
- | # import file
|
|
||||||
../../openrefine/client -P {{.PORT}} \
|
|
||||||
--create duplicates.csv \
|
|
||||||
--encoding UTF-8 \
|
|
||||||
--projectName {{.PROJECT}}
|
|
||||||
ignore_error: true # workaround
|
|
||||||
|
|
||||||
apply:
|
|
||||||
dir: config
|
|
||||||
cmds:
|
|
||||||
- | # apply transformation rules
|
|
||||||
../../openrefine/client -P {{.PORT}} {{.PROJECT}} \
|
|
||||||
--apply duplicates-deletion.json
|
|
||||||
ignore_error: true # workaround
|
|
||||||
|
|
||||||
export:
|
|
||||||
dir: output
|
|
||||||
cmds:
|
|
||||||
- | # export to file; use readlink to log full path to output file
|
|
||||||
../../openrefine/client -P {{.PORT}} {{.PROJECT}} \
|
|
||||||
--output "$(readlink -m deduped.xls)"
|
|
||||||
ignore_error: true # workaround
|
|
||||||
|
|
||||||
stats:
|
|
||||||
cmds:
|
|
||||||
- ps -o start,etime,%mem,%cpu,rss -p $(lsof -t -i:{{.PORT}}) # print allocated system resources
|
|
||||||
ignore_error: true # workaround
|
|
||||||
|
|
||||||
default: # enable standalone execution (running `task` in project directory)
|
default: # enable standalone execution (running `task` in project directory)
|
||||||
cmds:
|
cmds:
|
||||||
|
|
|
@ -6,73 +6,57 @@ tasks:
|
||||||
cmds:
|
cmds:
|
||||||
- task: refine
|
- task: refine
|
||||||
- task: :check # check OpenRefine log for any warnings and exit on error
|
- task: :check # check OpenRefine log for any warnings and exit on error
|
||||||
vars: {PROJECT: '{{splitList ":" .TASK | first}}'}
|
vars: {PROJECT: '{{splitList ":" .TASK | first}}'}
|
||||||
|
|
||||||
refine:
|
refine:
|
||||||
|
dir: ./{{.PROJECT}}
|
||||||
vars:
|
vars:
|
||||||
PORT: 3336 # assign a different port for each project
|
PORT: 3336 # assign a different port for each project
|
||||||
RAM: 2048M # maximum RAM for OpenRefine java heap space
|
RAM: 2048M # maximum RAM for OpenRefine java heap space
|
||||||
PROJECT: '{{splitList ":" .TASK | first}}'
|
PROJECT: '{{splitList ":" .TASK | first}}'
|
||||||
deps: # will be executed each run independent of up-to-date check
|
deps: # will be executed each run independent of up-to-date check
|
||||||
- task: download
|
- task: download
|
||||||
cmds: # tasks prepended with ":" are defined in Taskfile.yml
|
cmds:
|
||||||
- task: :start
|
- task: :start # launch OpenRefine
|
||||||
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}', RAM: '{{.RAM}}'}
|
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}', RAM: '{{.RAM}}'}
|
||||||
- task: import
|
- > # import file
|
||||||
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
|
../openrefine/client -P {{.PORT}}
|
||||||
- task: apply
|
--create "$(readlink -m input/phm-collection.tsv)"
|
||||||
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
|
--processQuotes false
|
||||||
- task: export
|
--guessCellValueTypes true
|
||||||
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
|
--projectName {{.PROJECT}}
|
||||||
- task: stats
|
- > # apply transformation rules
|
||||||
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
|
../openrefine/client -P {{.PORT}} {{.PROJECT}}
|
||||||
- task: :stop
|
--apply config/phm-transform.json
|
||||||
|
- > # export to file
|
||||||
|
mkdir -p output &&
|
||||||
|
../openrefine/client -P {{.PORT}} {{.PROJECT}}
|
||||||
|
--output "$(readlink -m output/phm-results.tsv)"
|
||||||
|
- | # print allocated system resources
|
||||||
|
PID="$(lsof -t -i:{{.PORT}})"
|
||||||
|
echo "used $(($(ps --no-headers -o rss -p "$PID") / 1024)) MB RAM"
|
||||||
|
echo "used $(ps --no-headers -o cputime -p "$PID") CPU time"
|
||||||
|
- task: :stop # shut down OpenRefine and archive the OpenRefine project
|
||||||
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
|
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
|
||||||
sources:
|
sources:
|
||||||
- input/**
|
- input/**
|
||||||
- config/**
|
- config/**
|
||||||
generates:
|
generates:
|
||||||
- output/openrefine.log
|
- openrefine.log
|
||||||
- output/{{.PROJECT}}.openrefine.tar.gz
|
- ./{{.PROJECT}}.openrefine.tar.gz
|
||||||
ignore_error: true # workaround to avoid an orphaned Java process on error https://github.com/go-task/task/issues/141
|
- output/**
|
||||||
|
ignore_error: true # workaround to avoid an orphaned Java process on error
|
||||||
|
# https://github.com/go-task/task/issues/141
|
||||||
download:
|
download:
|
||||||
|
dir: '{{splitList ":" .TASK | first}}'
|
||||||
cmds:
|
cmds:
|
||||||
- mkdir -p input config
|
- mkdir -p input config
|
||||||
- wget --no-verbose -O input/phm-collection.tsv https://github.com/opencultureconsulting/openrefine-batch/raw/master/examples/powerhouse-museum/input/phm-collection.tsv
|
- > # Download input
|
||||||
- wget --no-verbose -O config/phm-transform.json https://github.com/opencultureconsulting/openrefine-batch/raw/master/examples/powerhouse-museum/config/phm-transform.json
|
wget --no-verbose -O input/phm-collection.tsv
|
||||||
|
https://github.com/opencultureconsulting/openrefine-batch/raw/master/examples/powerhouse-museum/input/phm-collection.tsv
|
||||||
import:
|
- > # Download config
|
||||||
dir: input
|
wget --no-verbose -O config/phm-transform.json
|
||||||
cmds:
|
https://github.com/opencultureconsulting/openrefine-batch/raw/master/examples/powerhouse-museum/config/phm-transform.json
|
||||||
- | # import file
|
|
||||||
../../openrefine/client -P {{.PORT}} \
|
|
||||||
--create phm-collection.tsv \
|
|
||||||
--processQuotes false \
|
|
||||||
--guessCellValueTypes true \
|
|
||||||
--projectName {{.PROJECT}}
|
|
||||||
ignore_error: true # workaround
|
|
||||||
|
|
||||||
apply:
|
|
||||||
dir: config
|
|
||||||
cmds:
|
|
||||||
- | # apply transformation rules
|
|
||||||
../../openrefine/client -P {{.PORT}} {{.PROJECT}} \
|
|
||||||
--apply phm-transform.json
|
|
||||||
ignore_error: true # workaround
|
|
||||||
|
|
||||||
export:
|
|
||||||
dir: output
|
|
||||||
cmds:
|
|
||||||
- | # export to file; use readlink to log full path to output file
|
|
||||||
../../openrefine/client -P {{.PORT}} {{.PROJECT}} \
|
|
||||||
--output "$(readlink -m phm-results.tsv)"
|
|
||||||
ignore_error: true # workaround
|
|
||||||
|
|
||||||
stats:
|
|
||||||
cmds:
|
|
||||||
- ps -o start,etime,%mem,%cpu,rss -p $(lsof -t -i:{{.PORT}}) # print allocated system resources
|
|
||||||
ignore_error: true # workaround
|
|
||||||
|
|
||||||
default: # enable standalone execution (running `task` in project directory)
|
default: # enable standalone execution (running `task` in project directory)
|
||||||
cmds:
|
cmds:
|
||||||
|
|
Loading…
Reference in New Issue