reduce complexity

This commit is contained in:
Felix Lohmeier 2022-04-06 13:30:59 +02:00
parent 1341e1b45c
commit 8ee91ee84f
11 changed files with 147 additions and 404 deletions

View File

@ -1,86 +0,0 @@
name: run all tasks

on:
  workflow_dispatch: # manual trigger from the Actions tab

jobs:
  # One job per example project. The job id is reused as the task namespace
  # ("<job>:main") and as the artifact name/path; it is available as the
  # environment variable $GITHUB_JOB and as the context ${{ github.job }}.
  example-doaj:
    runs-on: ubuntu-20.04
    steps:
      - uses: actions/checkout@v2
      - name: install go-task 3.10.0
        run: |
          wget --no-verbose -O task.tar.gz https://github.com/go-task/task/releases/download/v3.10.0/task_linux_amd64.tar.gz
          sudo tar -xzf task.tar.gz -C /usr/local/bin task && rm task.tar.gz
      - name: install OpenRefine and openrefine-client
        run: task install
      - name: run task ${{ github.job }}
        run: task "$GITHUB_JOB:main"
      # keep the project directory as an artifact even when the task failed
      - uses: actions/upload-artifact@v2
        if: always()
        with:
          name: ${{ github.job }}
          path: ${{ github.job }}
          retention-days: 5
      - name: commit and push if output changed # has nothing to do currently because of .gitignore
        run: |-
          git config user.name "Automated"
          git config user.email "actions@users.noreply.github.com"
          git add -A
          git status
          git commit -m "latest change: $(date -u)" || exit 0
          git push

  example-duplicates:
    runs-on: ubuntu-20.04
    steps:
      - uses: actions/checkout@v2
      - name: install go-task 3.10.0
        run: |
          wget --no-verbose -O task.tar.gz https://github.com/go-task/task/releases/download/v3.10.0/task_linux_amd64.tar.gz
          sudo tar -xzf task.tar.gz -C /usr/local/bin task && rm task.tar.gz
      - name: install OpenRefine and openrefine-client
        run: task install
      - name: run task ${{ github.job }}
        run: task "$GITHUB_JOB:main"
      # keep the project directory as an artifact even when the task failed
      - uses: actions/upload-artifact@v2
        if: always()
        with:
          name: ${{ github.job }}
          path: ${{ github.job }}
          retention-days: 5
      - name: commit and push if output changed # has nothing to do currently because of .gitignore
        run: |-
          git config user.name "Automated"
          git config user.email "actions@users.noreply.github.com"
          git add -A
          git status
          git commit -m "latest change: $(date -u)" || exit 0
          git push

  example-powerhouse:
    runs-on: ubuntu-20.04
    steps:
      - uses: actions/checkout@v2
      - name: install go-task 3.10.0
        run: |
          wget --no-verbose -O task.tar.gz https://github.com/go-task/task/releases/download/v3.10.0/task_linux_amd64.tar.gz
          sudo tar -xzf task.tar.gz -C /usr/local/bin task && rm task.tar.gz
      - name: install OpenRefine and openrefine-client
        run: task install
      - name: run task ${{ github.job }}
        run: task "$GITHUB_JOB:main"
      # keep the project directory as an artifact even when the task failed
      - uses: actions/upload-artifact@v2
        if: always()
        with:
          name: ${{ github.job }}
          path: ${{ github.job }}
          retention-days: 5
      - name: commit and push if output changed # has nothing to do currently because of .gitignore
        run: |-
          git config user.name "Automated"
          git config user.email "actions@users.noreply.github.com"
          git add -A
          git status
          git commit -m "latest change: $(date -u)" || exit 0
          git push

44
.github/workflows/example.yml vendored Normal file
View File

@ -0,0 +1,44 @@
name: example

on:
  workflow_dispatch: # allows you to run this workflow manually from the Actions tab

jobs:
  # Runs the OpenRefine pipeline step by step (start, import, transform,
  # export), always prints stats, checks the log and stops the server, then
  # uploads the workspace and commits any changed files.
  main:
    runs-on: ubuntu-20.04
    steps:
      - uses: actions/checkout@v2
      - name: install go-task 3.10.0
        run: |
          wget --no-verbose -O task.tar.gz https://github.com/go-task/task/releases/download/v3.10.0/task_linux_amd64.tar.gz
          sudo tar -xzf task.tar.gz -C /usr/local/bin task && rm task.tar.gz
      - name: install OpenRefine and openrefine-client
        run: task install
      # Each `task ...` step below is a separate Task process. Without a fixed
      # OPENREFINE_TMPDIR every step would get its own workspace, so server
      # log, client log, log check and the uploaded artifact would not refer
      # to the same directory. Pin one workspace for all following steps; it
      # is the directory uploaded as artifact at the end.
      # NOTE(review): assumes a variable already present in the environment
      # takes precedence over the Taskfile's own default - confirm.
      - name: create OpenRefine workspace
        run: |
          mkdir -p .openrefine/data
          echo "OPENREFINE_TMPDIR=$PWD/.openrefine/data" >> "$GITHUB_ENV"
      - name: start OpenRefine
        run: task start
      - name: import
        run: task import
      - name: transform
        run: task transform
      - name: export
        run: task export
      # the following three steps run even when an earlier step failed
      - name: print stats
        if: always()
        run: task stats
      - name: check log file
        if: always()
        run: task check
      - name: stop OpenRefine
        if: always()
        run: task stop
      - uses: actions/upload-artifact@v2
        if: always()
        with:
          name: OpenRefine project and logfile
          path: .openrefine/data
          retention-days: 7
      - name: git commit and push
        run: |
          git config user.name "Automated"
          git config user.email "actions@users.noreply.github.com"
          task git

7
.gitignore vendored
View File

@ -1,9 +1,2 @@
.task
.openrefine
*/output
*/*.log
*/*.openrefine.tar.gz
example-doaj/input
example-doaj/config
example-powerhouse/input
example-powerhouse/config

View File

@ -1,21 +1,20 @@
# OpenRefine Task Runner (💎+🤖)
[![Codacy Badge](https://app.codacy.com/project/badge/Grade/888dbf663fdd409e8d8fcf8472114194)](https://www.codacy.com/gh/opencultureconsulting/openrefine-task-runner/dashboard) [![Binder](https://notebooks.gesis.org/binder/badge_logo.svg)](https://notebooks.gesis.org/binder/v2/gh/opencultureconsulting/openrefine-task-runner/main?urlpath=lab/tree/demo.ipynb)
[![Codacy Badge](https://app.codacy.com/project/badge/Grade/888dbf663fdd409e8d8fcf8472114194)](https://www.codacy.com/gh/opencultureconsulting/openrefine-task-runner/dashboard) [![Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/felixlohmeier/openrefineder/master)
Templates for OpenRefine batch processing (import, transform, export) using the task runner [go-task](https://github.com/go-task/task) and the [openrefine-client](https://github.com/opencultureconsulting/openrefine-client) to control OpenRefine via [its HTTP API](https://docs.openrefine.org/technical-reference/openrefine-api).
## Features
* run tasks in parallel
* basic error handling by monitoring the OpenRefine server log
* dedicated OpenRefine instances for each task (your existing OpenRefine data will not be touched)
* dedicated OpenRefine instance with temporary workspace (your existing OpenRefine data will not be touched)
* prevent unnecessary work by fingerprinting generated files and their sources
* the [openrefine-client](https://github.com/opencultureconsulting/openrefine-client) used here supports many core features of OpenRefine:
* import CSV, TSV, line-based TXT, fixed-width TXT, JSON or XML (and specify input options)
* apply [undo/redo history](https://docs.openrefine.org/manual/running/#reusing-operations) from given JSON file(s)
* export to CSV, TSV, HTML, XLS, XLSX, ODS
* [templating export](https://github.com/opencultureconsulting/openrefine-client#templating) to additional formats like JSON or XML
* works with OpenRefine 2.7, 2.8, 3.0, 3.1, 3.2, 3.3, 3.4, 3.4.1 and 3.5
* works with OpenRefine 2.7, 2.8, 3.0, 3.1, 3.2, 3.3, 3.4 and 3.5
* tasks are easy to extend with additional commands (e.g. to download input data or validate results)
## Typical workflow
@ -26,10 +25,10 @@ Templates for OpenRefine batch processing (import, transform, export) using the
**Possible automation benefits:**
* When you receive updated data (in the same structure), you just need to drop the file and start the task like this:
* When you receive updated data (in the same structure), you just need to drop the input file and start the task like this:
```sh
task example-doaj
task
```
* The entire data processing (including options during import) becomes reproducible. The task configuration file can also be used for documentation through source code comments.
@ -38,18 +37,17 @@ Templates for OpenRefine batch processing (import, transform, export) using the
## Requirements
* GNU/Linux (tested with Fedora 32)
* GNU/Linux (tested with Fedora 34)
* Java 8+ (for OpenRefine)
## Demo via binder
[![Binder](https://notebooks.gesis.org/binder/badge_logo.svg)](https://notebooks.gesis.org/binder/v2/gh/opencultureconsulting/openrefine-task-runner/main?urlpath=lab/tree/demo.ipynb)
[![Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/felixlohmeier/openrefineder/master)
- free-to-use on-demand server with JupyterLab and Bash kernel
- OpenRefine, openrefine-client and go-task [preinstalled](binder/postBuild)
- no registration needed, will start within a few minutes
- [restricted](https://notebooks.gesis.org/faq/) to 4 GB RAM and server will be deleted after 10 minutes of inactivity
- service is provided by GESIS and is intended for use by social scientists
- [restricted](https://mybinder.readthedocs.io/en/latest/about/about.html#how-much-memory-am-i-given-when-using-binder) to 2 GB RAM and server will be deleted after 10 minutes of inactivity
## Install
@ -60,7 +58,7 @@ Templates for OpenRefine batch processing (import, transform, export) using the
cd openrefine-task-runner
```
2. Install [Task 3.10.0](https://github.com/go-task/task/releases/tag/v3.10.0)
2. Install [Task](https://github.com/go-task/task/releases/tag/v3.10.0) 3.10.0 or later
a) RPM-based (Fedora, CentOS, SLES, etc.)
@ -84,34 +82,28 @@ Templates for OpenRefine batch processing (import, transform, export) using the
## Usage
* Run all tasks in parallel
* Run default task (start, import, transform, export, stats, check, kill and cleanup)
```sh
task
task default
```
* Run a specific task
* Override settings with environment variables
```sh
task example-duplicates:main
```
* Run some tasks in parallel
```sh
task --parallel example-duplicates:main example-doaj:main
OPENREFINE_MEMORY=2000M OPENREFINE_PORT=3334 task default
```
* Force run a task even when the task is up-to-date
```sh
task example-duplicates:main --force
task default --force
```
* Dry-run in verbose mode for debugging
```sh
task example-duplicates:main --dry --verbose --force
task default --dry --verbose --force
```
* List available tasks
@ -120,17 +112,9 @@ Templates for OpenRefine batch processing (import, transform, export) using the
task --list
```
### How to develop your own tasks
### Examples
(first draft, will be elaborated later)
1. create a new folder
2. copy an example Taskfile.yml
3. provide input data in subdirectory input
4. provide OpenRefine transformation history files in subdirectory config
5. add commands to specific Taskfile (check openrefine-client help screen for available options: `openrefine/client --help`)
6. add project to general Taskfile
7. check memory load and increase RAM if needed
* [noah-biejournals](https://github.com/opencultureconsulting/noah-biejournals): Harvesting of the BieJournals journal server of Bielefeld University Library and transformation to METS/MODS for the noah.nrw portal
### Getting help

View File

@ -1,102 +1,109 @@
# https://github.com/opencultureconsulting/openrefine-task-runner
version: '3'
includes:
example-doaj: example-doaj
example-duplicates: example-duplicates
example-powerhouse: example-powerhouse
# add the directory name of your project here
silent: true
output: prefixed
env:
OPENREFINE:
sh: readlink -m .openrefine/refine
CLIENT:
sh: readlink -m .openrefine/client
OPENREFINE_MEMORY: 5120M
OPENREFINE_PORT: 3333
OPENREFINE_APPDIR:
sh: readlink -m .openrefine
OPENREFINE_TMPDIR:
sh: mktemp -d
tasks:
default:
desc: execute all projects in parallel
deps:
- task: example-doaj:refine
- task: example-duplicates:refine
- task: example-powerhouse:refine
# add the directory name of your project here
desc: run tasks start, import, transform, export, stats, check, kill and cleanup
cmds:
- task: check
- defer: { task: cleanup } # will run even when one of the following commands fail
- task: start
- defer: { task: kill } # will run before cleanup
- defer: { task: check } # will run before kill
- defer: { task: stats } # will run before check
- task: import
- task: transform
- task: export
sources:
- Taskfile.yml
- input/**
- config/**
generates:
- output/**
install:
desc: (re)install OpenRefine and openrefine-client into subdirectory .openrefine
desc: install OpenRefine and openrefine-client into subdirectory ${OPENREFINE_APPDIR}
cmds:
- | # delete existing install and recreate folder
rm -rf .openrefine
mkdir -p .openrefine
- > # download OpenRefine archive
wget --no-verbose -O openrefine.tar.gz
https://github.com/OpenRefine/OpenRefine/releases/download/3.5.2/openrefine-linux-3.5.2.tar.gz
- | # install OpenRefine into subdirectory .openrefine
tar -xzf openrefine.tar.gz -C .openrefine --strip 1
rm openrefine.tar.gz
- mkdir -p "${OPENREFINE_APPDIR}"
- | # install OpenRefine into subdirectory ${OPENREFINE_APPDIR}
wget --no-verbose -O openrefine.tar.gz https://github.com/OpenRefine/OpenRefine/releases/download/3.5.2/openrefine-linux-3.5.2.tar.gz
tar -xzf openrefine.tar.gz -C "${OPENREFINE_APPDIR}" --strip 1 && rm openrefine.tar.gz
- | # optimize OpenRefine for batch processing
sed -i 's/cd `dirname $0`/cd "$(dirname "$0")"/' ".openrefine/refine" # fix path issue in OpenRefine startup file
sed -i '$ a JAVA_OPTIONS=-Drefine.headless=true' ".openrefine/refine.ini" # do not try to open OpenRefine in browser
sed -i 's/#REFINE_AUTOSAVE_PERIOD=60/REFINE_AUTOSAVE_PERIOD=1440/' ".openrefine/refine.ini" # set autosave period from 5 minutes to 25 hours
- > # download openrefine-client into subdirectory .openrefine
wget --no-verbose -O .openrefine/client
https://github.com/opencultureconsulting/openrefine-client/releases/download/v0.3.10/openrefine-client_0-3-10_linux
- chmod +x .openrefine/client # make client executable
sed -i 's/cd `dirname $0`/cd "$(dirname "$0")"/' "${OPENREFINE_APPDIR}/refine" # fix path issue in OpenRefine startup file
sed -i '$ a JAVA_OPTIONS=-Drefine.headless=true' "${OPENREFINE_APPDIR}/refine.ini" # do not try to open OpenRefine in browser
sed -i 's/#REFINE_AUTOSAVE_PERIOD=60/REFINE_AUTOSAVE_PERIOD=1440/' "${OPENREFINE_APPDIR}/refine.ini" # set autosave period from 5 minutes to 25 hours
- | # install openrefine-client into subdirectory ${OPENREFINE_APPDIR}
wget --no-verbose -O "${OPENREFINE_APPDIR}/client" https://github.com/opencultureconsulting/openrefine-client/releases/download/v0.3.10/openrefine-client_0-3-10_linux
chmod +x "${OPENREFINE_APPDIR}/client"
start:
dir: ./{{.DIR}}
cmds:
- | # verify that OpenRefine is installed
if [ ! -f "$OPENREFINE" ]; then
- | # requirement OpenRefine
if [ ! -f "${OPENREFINE_APPDIR}/refine" ]; then
echo 1>&2 "OpenRefine missing; try task install"; exit 1
fi
- | # delete temporary files and log file of previous run
rm -rf ./*.project* workspace.json
rm -rf "{{.PROJECT}}.log"
- > # launch OpenRefine with specific data directory and redirect its output to a log file
"$OPENREFINE" -v warn -p {{.PORT}} -m {{.RAM}}
-d ../{{.DIR}}
>> "{{.PROJECT}}.log" 2>&1 &
- | # launch OpenRefine with specific data directory and redirect its output to a log file
"${OPENREFINE_APPDIR}/refine" -v warn -p "$OPENREFINE_PORT" -m "$OPENREFINE_MEMORY" -d "${OPENREFINE_TMPDIR}" >> "${OPENREFINE_TMPDIR}/log.txt" 2>&1 &
- | # wait until OpenRefine API is available
timeout 30s bash -c "until
wget -q -O - -o /dev/null http://localhost:{{.PORT}} | cat | grep -q -o OpenRefine
do sleep 1
done"
timeout 30s bash -c "until wget -q -O - -o /dev/null http://localhost:${OPENREFINE_PORT} | cat | grep -q -o OpenRefine; do sleep 1; done"
stop:
dir: ./{{.DIR}}
cmds:
- | # shut down OpenRefine gracefully
PID=$(lsof -t -i:{{.PORT}})
kill $PID
while ps -p $PID > /dev/null; do sleep 1; done
- > # archive the OpenRefine project
tar cfz
"{{.PROJECT}}.openrefine.tar.gz"
-C $(grep -l "{{.PROJECT}}" *.project/metadata.json | cut -d '/' -f 1)
.
- rm -rf ./*.project* workspace.json # delete temporary files
import:
  cmds:
    # Create the OpenRefine project "myproject" from the input file (the
    # client requires an absolute path). Client output is shown and also
    # appended to the log file in the OpenRefine workspace.
    - |
      "${OPENREFINE_APPDIR}/client" \
        --create "$(readlink -m input/duplicates.csv)" \
        --projectName myproject \
        > >(tee -a "${OPENREFINE_TMPDIR}/log.txt") 2>&1
kill:
dir: ./{{.DIR}}
cmds:
- | # shut down OpenRefine immediately to save time and disk space
PID=$(lsof -t -i:{{.PORT}})
kill -9 $PID
while ps -p $PID > /dev/null; do sleep 1; done
- rm -rf ./*.project* workspace.json # delete temporary files
transform:
  cmds:
    # Apply every undo/redo history file found in config/ to "myproject".
    # Client output is shown and also appended to the workspace log file.
    - |
      for f in config/*.json; do
        # an unmatched glob is left as the literal pattern; skip it instead
        # of handing a non-existent file to the client
        [ -e "$f" ] || continue
        "${OPENREFINE_APPDIR}/client" myproject --apply "$f" \
          > >(tee -a "${OPENREFINE_TMPDIR}/log.txt") 2>&1
      done
export:
  cmds:
    - mkdir -p output
    # Write "myproject" to output/deduped.tsv (passed as absolute path).
    # Client output is shown and also appended to the workspace log file.
    - |
      "${OPENREFINE_APPDIR}/client" myproject \
        --output "$(readlink -m output/deduped.tsv)" \
        > >(tee -a "${OPENREFINE_TMPDIR}/log.txt") 2>&1
stats:
  cmds:
    # Report resident memory (rss converted from KB to MB) and CPU time of
    # the process found on the OpenRefine port; both lines are printed and
    # appended to the workspace log file.
    - |
      pid="$(lsof -t -i:${OPENREFINE_PORT})"
      echo "used $(($(ps --no-headers -o rss -p "$pid") / 1024)) MB RAM" \
        > >(tee -a "${OPENREFINE_TMPDIR}/log.txt") 2>&1
      echo "used $(ps --no-headers -o cputime -p "$pid") CPU time" \
        > >(tee -a "${OPENREFINE_TMPDIR}/log.txt") 2>&1
check:
desc: check OpenRefine log for any warnings and exit on error
dir: ./{{.DIR}}
cmds:
- | # find log file(s) and check for "exception" or "error"
if grep -i 'exception\|error' $(find . -name '*.log'); then
echo 1>&2 "log contains warnings!"; exit 1
- | # check log file for any warnings
if grep -i 'exception\|error' "${OPENREFINE_TMPDIR}/log.txt"
then echo 1>&2 "log contains warnings!"; echo; cat "${OPENREFINE_TMPDIR}/log.txt"; exit 1
fi
stop:
  cmds:
    # Shut down OpenRefine gracefully: send the default signal to the process
    # found on the OpenRefine port and wait until it has exited.
    - |
      PID="$(lsof -t -i:${OPENREFINE_PORT})"
      # no process found for the port (e.g. start failed): nothing to stop,
      # so do not call kill/ps without a PID
      if [ -z "$PID" ]; then exit 0; fi
      # $PID is left unquoted so that several PIDs become separate arguments
      kill $PID; while ps -p $PID > /dev/null; do sleep 1; done
kill:
  cmds:
    # Shut down OpenRefine immediately (SIGKILL) to save time.
    - |
      PID="$(lsof -t -i:${OPENREFINE_PORT})"
      # no process found for the port (e.g. start failed): nothing to kill;
      # without this guard `kill -9` is called without a PID and fails
      if [ -z "$PID" ]; then exit 0; fi
      # $PID is left unquoted so that several PIDs become separate arguments
      kill -9 $PID
cleanup:
  cmds:
    # remove the OpenRefine workspace directory including its log file
    - rm -rf "${OPENREFINE_TMPDIR}"
git:
  desc: commit and push if something changed
  cmds:
    # stage everything, including deletions
    - git add -A
    # a failing commit (e.g. nothing to commit) must not fail the task
    - git commit -m "latest change $(date -u)" || exit 0
    - git push

View File

@ -1 +0,0 @@
{"cells":[{"metadata":{},"cell_type":"markdown","source":"## Run all tasks in parallel"},{"metadata":{"trusted":true},"cell_type":"code","source":"task","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"## Run a specific task"},{"metadata":{"trusted":true},"cell_type":"code","source":"task example-duplicates:main","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"## Run some tasks in parallel"},{"metadata":{"trusted":true},"cell_type":"code","source":"task --parallel example-duplicates:main example-doaj:main","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"## Force run a task even when the task is up-to-date"},{"metadata":{"trusted":true},"cell_type":"code","source":"task example-duplicates:main --force","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"## Dry-run in verbose mode for debugging"},{"metadata":{"trusted":true},"cell_type":"code","source":"task example-duplicates:main --dry --verbose --force","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"## List available tasks"},{"metadata":{"trusted":true},"cell_type":"code","source":"task --list","execution_count":null,"outputs":[]}],"metadata":{"kernelspec":{"name":"bash","display_name":"Bash","language":"bash"},"language_info":{"name":"bash","codemirror_mode":"shell","mimetype":"text/x-sh","file_extension":".sh"}},"nbformat":4,"nbformat_minor":5}

View File

@ -1,70 +0,0 @@
version: '3'

tasks:
  main:
    desc: Library Carpentry Lesson covering DOAJ
    vars:
      # first segment of the namespaced task name = task namespace, which is
      # identical to the directory name
      DIR: '{{splitList ":" .TASK | first}}'
    cmds:
      - task: refine
      # check OpenRefine log for any warnings and exit on error
      - task: :check
        vars:
          DIR: '{{.DIR}}'

  refine:
    dir: ./{{.DIR}}
    vars:
      DIR: '{{splitList ":" .TASK | first}}'
      PROJECT: doaj
      PORT: 3334 # assign a different port for each project
      RAM: 2048M # maximum RAM for OpenRefine java heap space
      # Redirect target appended after ">" in the commands below. Be careful
      # when changing it: the log path should match the server log written by
      # the main task "start".
      LOG: '>(tee -a "{{.PROJECT}}.log") 2>&1'
    deps:
      # will be executed each run independent of up-to-date check
      - task: download
    cmds:
      # launch OpenRefine
      - task: :start
        vars:
          DIR: '{{.DIR}}'
          PROJECT: '{{.PROJECT}}'
          PORT: '{{.PORT}}'
          RAM: '{{.RAM}}'
      # import file
      - >
        "$CLIENT" -P {{.PORT}}
        --create "$(readlink -m input/doaj-article-sample.csv)"
        --projectName "{{.PROJECT}}"
        > {{.LOG}}
      # apply transformation rules
      - >
        "$CLIENT" -P {{.PORT}} "{{.PROJECT}}"
        --apply config/doaj-openrefine.json
        > {{.LOG}}
      - mkdir -p output
      # export to file
      - >
        "$CLIENT" -P {{.PORT}} "{{.PROJECT}}"
        --output "$(readlink -m output/doaj-results.tsv)"
        > {{.LOG}}
      # print allocated system resources
      - |
        PID="$(lsof -t -i:{{.PORT}})"
        echo "used $(($(ps --no-headers -o rss -p "$PID") / 1024)) MB RAM" > {{.LOG}}
        echo "used $(ps --no-headers -o cputime -p "$PID") CPU time" > {{.LOG}}
      # shut down OpenRefine and archive the OpenRefine project
      - task: :stop
        vars:
          DIR: '{{.DIR}}'
          PROJECT: '{{.PROJECT}}'
          PORT: '{{.PORT}}'
    sources:
      - Taskfile.yml
      - input/**
      - config/**
    generates:
      - ./{{.PROJECT}}.openrefine.tar.gz
      - output/**
    # workaround to avoid an orphaned Java process on error
    # https://github.com/go-task/task/issues/141
    ignore_error: true

  download:
    dir: ./{{.DIR}}
    vars:
      DIR: '{{splitList ":" .TASK | first}}'
    cmds:
      - mkdir -p input config
      # Download input
      - >
        wget --no-verbose -O input/doaj-article-sample.csv
        https://github.com/felixlohmeier/openrefine-kimws2019/raw/master/doaj-article-sample.csv
      # Download config
      - >
        wget --no-verbose -O config/doaj-openrefine.json
        https://github.com/felixlohmeier/openrefine-kimws2019/raw/master/doaj-openrefine.json

  # enable standalone execution (running `task` in project directory)
  default:
    cmds:
      - DIR="${PWD##*/}:main" && cd .. && task "$DIR"

View File

@ -1,56 +0,0 @@
version: '3'

tasks:
  main:
    desc: Removing duplicates in a very small test dataset
    vars:
      # first segment of the namespaced task name = task namespace, which is
      # identical to the directory name
      DIR: '{{splitList ":" .TASK | first}}'
    cmds:
      - task: refine
      # check OpenRefine log for any warnings and exit on error
      - task: :check
        vars:
          DIR: '{{.DIR}}'

  refine:
    dir: ./{{.DIR}}
    vars:
      DIR: '{{splitList ":" .TASK | first}}'
      PROJECT: duplicates
      PORT: 3335 # assign a different port for each project
      RAM: 2048M # maximum RAM for OpenRefine java heap space
      # Redirect target appended after ">" in the commands below. Be careful
      # when changing it: the log path should match the server log written by
      # the main task "start".
      LOG: '>(tee -a "{{.PROJECT}}.log") 2>&1'
    cmds:
      # launch OpenRefine
      - task: :start
        vars:
          DIR: '{{.DIR}}'
          PROJECT: '{{.PROJECT}}'
          PORT: '{{.PORT}}'
          RAM: '{{.RAM}}'
      # import file
      - >
        "$CLIENT" -P {{.PORT}}
        --create "$(readlink -m input/duplicates.csv)"
        --encoding UTF-8
        --projectName "{{.PROJECT}}"
        > {{.LOG}}
      # apply transformation rules
      - >
        "$CLIENT" -P {{.PORT}} "{{.PROJECT}}"
        --apply config/duplicates-deletion.json
        > {{.LOG}}
      - mkdir -p output
      # export to file
      - >
        "$CLIENT" -P {{.PORT}} "{{.PROJECT}}"
        --output "$(readlink -m output/deduped.xls)"
        > {{.LOG}}
      # print allocated system resources
      - |
        PID="$(lsof -t -i:{{.PORT}})"
        echo "used $(($(ps --no-headers -o rss -p "$PID") / 1024)) MB RAM" > {{.LOG}}
        echo "used $(ps --no-headers -o cputime -p "$PID") CPU time" > {{.LOG}}
      # shut down OpenRefine and archive the OpenRefine project
      - task: :stop
        vars:
          DIR: '{{.DIR}}'
          PROJECT: '{{.PROJECT}}'
          PORT: '{{.PORT}}'
    sources:
      - Taskfile.yml
      - input/**
      - config/**
    generates:
      - ./{{.PROJECT}}.openrefine.tar.gz
      - output/**
    # workaround to avoid an orphaned Java process on error
    # https://github.com/go-task/task/issues/141
    ignore_error: true

  # enable standalone execution (running `task` in project directory)
  default:
    cmds:
      - DIR="${PWD##*/}:main" && cd .. && task "$DIR"

View File

@ -1,72 +0,0 @@
version: '3'

tasks:
  main:
    desc: Powerhouse Museum Tutorial
    vars:
      # first segment of the namespaced task name = task namespace, which is
      # identical to the directory name
      DIR: '{{splitList ":" .TASK | first}}'
    cmds:
      - task: refine
      # check OpenRefine log for any warnings and exit on error
      - task: :check
        vars:
          DIR: '{{.DIR}}'

  refine:
    dir: ./{{.DIR}}
    vars:
      DIR: '{{splitList ":" .TASK | first}}'
      PROJECT: phm
      PORT: 3336 # assign a different port for each project
      RAM: 2048M # maximum RAM for OpenRefine java heap space
      # Redirect target appended after ">" in the commands below. Be careful
      # when changing it: the log path should match the server log written by
      # the main task "start".
      LOG: '>(tee -a "{{.PROJECT}}.log") 2>&1'
    deps:
      # will be executed each run independent of up-to-date check
      - task: download
    cmds:
      # launch OpenRefine
      - task: :start
        vars:
          DIR: '{{.DIR}}'
          PROJECT: '{{.PROJECT}}'
          PORT: '{{.PORT}}'
          RAM: '{{.RAM}}'
      # import file
      - >
        "$CLIENT" -P {{.PORT}}
        --create "$(readlink -m input/phm-collection.tsv)"
        --processQuotes false
        --guessCellValueTypes true
        --projectName "{{.PROJECT}}"
        > {{.LOG}}
      # apply transformation rules
      - >
        "$CLIENT" -P {{.PORT}} "{{.PROJECT}}"
        --apply config/phm-transform.json
        > {{.LOG}}
      - mkdir -p output
      # export to file
      - >
        "$CLIENT" -P {{.PORT}} "{{.PROJECT}}"
        --output "$(readlink -m output/phm-results.tsv)"
        > {{.LOG}}
      # print allocated system resources
      - |
        PID="$(lsof -t -i:{{.PORT}})"
        echo "used $(($(ps --no-headers -o rss -p "$PID") / 1024)) MB RAM" > {{.LOG}}
        echo "used $(ps --no-headers -o cputime -p "$PID") CPU time" > {{.LOG}}
      # shut down OpenRefine and archive the OpenRefine project
      - task: :stop
        vars:
          DIR: '{{.DIR}}'
          PORT: '{{.PORT}}'
          PROJECT: '{{.PROJECT}}'
    sources:
      - Taskfile.yml
      - input/**
      - config/**
    generates:
      - ./{{.PROJECT}}.openrefine.tar.gz
      - output/**
    # workaround to avoid an orphaned Java process on error
    # https://github.com/go-task/task/issues/141
    ignore_error: true

  download:
    dir: ./{{.DIR}}
    vars:
      DIR: '{{splitList ":" .TASK | first}}'
    cmds:
      - mkdir -p input config
      # Download input
      - >
        wget --no-verbose -O input/phm-collection.tsv
        https://github.com/opencultureconsulting/openrefine-batch/raw/master/examples/powerhouse-museum/input/phm-collection.tsv
      # Download config
      - >
        wget --no-verbose -O config/phm-transform.json
        https://github.com/opencultureconsulting/openrefine-batch/raw/master/examples/powerhouse-museum/config/phm-transform.json

  # enable standalone execution (running `task` in project directory)
  default:
    cmds:
      - DIR="${PWD##*/}:main" && cd .. && task "$DIR"