reduce complexity
This commit is contained in:
parent
1341e1b45c
commit
8ee91ee84f
|
@ -1,86 +0,0 @@
|
||||||
name: run all tasks
|
|
||||||
|
|
||||||
on:
|
|
||||||
workflow_dispatch: # allows you to run this workflow manually from the Actions tab
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
example-doaj: # available as environment variable $GITHUB_JOB and context ${{ github.job }}
|
|
||||||
runs-on: ubuntu-20.04
|
|
||||||
steps:
|
|
||||||
- uses: actions/checkout@v2
|
|
||||||
- name: install go-task 3.10.0
|
|
||||||
run: |
|
|
||||||
wget --no-verbose -O task.tar.gz https://github.com/go-task/task/releases/download/v3.10.0/task_linux_amd64.tar.gz
|
|
||||||
sudo tar -xzf task.tar.gz -C /usr/local/bin task && rm task.tar.gz
|
|
||||||
- name: install OpenRefine and openrefine-client
|
|
||||||
run: task install
|
|
||||||
- name: run task ${{ github.job }}
|
|
||||||
run: task "$GITHUB_JOB:main"
|
|
||||||
- uses: actions/upload-artifact@v2
|
|
||||||
if: always()
|
|
||||||
with:
|
|
||||||
name: ${{ github.job }}
|
|
||||||
path: ${{ github.job }}
|
|
||||||
retention-days: 5
|
|
||||||
- name: commit and push if output changed # has nothing to do currently because of .gitignore
|
|
||||||
run: |-
|
|
||||||
git config user.name "Automated"
|
|
||||||
git config user.email "actions@users.noreply.github.com"
|
|
||||||
git add -A
|
|
||||||
git status
|
|
||||||
git commit -m "latest change: $(date -u)" || exit 0
|
|
||||||
git push
|
|
||||||
|
|
||||||
example-duplicates: # available as environment variable $GITHUB_JOB and context ${{ github.job }}
|
|
||||||
runs-on: ubuntu-20.04
|
|
||||||
steps:
|
|
||||||
- uses: actions/checkout@v2
|
|
||||||
- name: install go-task 3.10.0
|
|
||||||
run: |
|
|
||||||
wget --no-verbose -O task.tar.gz https://github.com/go-task/task/releases/download/v3.10.0/task_linux_amd64.tar.gz
|
|
||||||
sudo tar -xzf task.tar.gz -C /usr/local/bin task && rm task.tar.gz
|
|
||||||
- name: install OpenRefine and openrefine-client
|
|
||||||
run: task install
|
|
||||||
- name: run task ${{ github.job }}
|
|
||||||
run: task "$GITHUB_JOB:main"
|
|
||||||
- uses: actions/upload-artifact@v2
|
|
||||||
if: always()
|
|
||||||
with:
|
|
||||||
name: ${{ github.job }}
|
|
||||||
path: ${{ github.job }}
|
|
||||||
retention-days: 5
|
|
||||||
- name: commit and push if output changed # has nothing to do currently because of .gitignore
|
|
||||||
run: |-
|
|
||||||
git config user.name "Automated"
|
|
||||||
git config user.email "actions@users.noreply.github.com"
|
|
||||||
git add -A
|
|
||||||
git status
|
|
||||||
git commit -m "latest change: $(date -u)" || exit 0
|
|
||||||
git push
|
|
||||||
|
|
||||||
example-powerhouse: # available as environment variable $GITHUB_JOB and context ${{ github.job }}
|
|
||||||
runs-on: ubuntu-20.04
|
|
||||||
steps:
|
|
||||||
- uses: actions/checkout@v2
|
|
||||||
- name: install go-task 3.10.0
|
|
||||||
run: |
|
|
||||||
wget --no-verbose -O task.tar.gz https://github.com/go-task/task/releases/download/v3.10.0/task_linux_amd64.tar.gz
|
|
||||||
sudo tar -xzf task.tar.gz -C /usr/local/bin task && rm task.tar.gz
|
|
||||||
- name: install OpenRefine and openrefine-client
|
|
||||||
run: task install
|
|
||||||
- name: run task ${{ github.job }}
|
|
||||||
run: task "$GITHUB_JOB:main"
|
|
||||||
- uses: actions/upload-artifact@v2
|
|
||||||
if: always()
|
|
||||||
with:
|
|
||||||
name: ${{ github.job }}
|
|
||||||
path: ${{ github.job }}
|
|
||||||
retention-days: 5
|
|
||||||
- name: commit and push if output changed # has nothing to do currently because of .gitignore
|
|
||||||
run: |-
|
|
||||||
git config user.name "Automated"
|
|
||||||
git config user.email "actions@users.noreply.github.com"
|
|
||||||
git add -A
|
|
||||||
git status
|
|
||||||
git commit -m "latest change: $(date -u)" || exit 0
|
|
||||||
git push
|
|
|
@ -0,0 +1,44 @@
|
||||||
|
name: example
|
||||||
|
|
||||||
|
on:
|
||||||
|
workflow_dispatch: # allows you to run this workflow manually from the Actions tab
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
main:
|
||||||
|
runs-on: ubuntu-20.04
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v2
|
||||||
|
- name: install go-task 3.10.0
|
||||||
|
run: |
|
||||||
|
wget --no-verbose -O task.tar.gz https://github.com/go-task/task/releases/download/v3.10.0/task_linux_amd64.tar.gz
|
||||||
|
sudo tar -xzf task.tar.gz -C /usr/local/bin task && rm task.tar.gz
|
||||||
|
- name: install OpenRefine and openrefine-client
|
||||||
|
run: task install
|
||||||
|
- name: start OpenRefine
|
||||||
|
run: task start
|
||||||
|
- name: import
|
||||||
|
run: task import
|
||||||
|
- name: transform
|
||||||
|
run: task transform
|
||||||
|
- name: export
|
||||||
|
run: task export
|
||||||
|
- name: print stats
|
||||||
|
if: always()
|
||||||
|
run: task stats
|
||||||
|
- name: check log file
|
||||||
|
if: always()
|
||||||
|
run: task check
|
||||||
|
- name: stop OpenRefine
|
||||||
|
if: always()
|
||||||
|
run: task stop
|
||||||
|
- uses: actions/upload-artifact@v2
|
||||||
|
if: always()
|
||||||
|
with:
|
||||||
|
name: OpenRefine project and logfile
|
||||||
|
path: .openrefine/data
|
||||||
|
retention-days: 7
|
||||||
|
- name: git commit and push
|
||||||
|
run: |
|
||||||
|
git config user.name "Automated"
|
||||||
|
git config user.email "actions@users.noreply.github.com"
|
||||||
|
task git
|
|
@ -1,9 +1,2 @@
|
||||||
.task
|
.task
|
||||||
.openrefine
|
.openrefine
|
||||||
*/output
|
|
||||||
*/*.log
|
|
||||||
*/*.openrefine.tar.gz
|
|
||||||
example-doaj/input
|
|
||||||
example-doaj/config
|
|
||||||
example-powerhouse/input
|
|
||||||
example-powerhouse/config
|
|
||||||
|
|
50
README.md
50
README.md
|
@ -1,21 +1,20 @@
|
||||||
# OpenRefine Task Runner (💎+🤖)
|
# OpenRefine Task Runner (💎+🤖)
|
||||||
|
|
||||||
[![Codacy Badge](https://app.codacy.com/project/badge/Grade/888dbf663fdd409e8d8fcf8472114194)](https://www.codacy.com/gh/opencultureconsulting/openrefine-task-runner/dashboard) [![Binder](https://notebooks.gesis.org/binder/badge_logo.svg)](https://notebooks.gesis.org/binder/v2/gh/opencultureconsulting/openrefine-task-runner/main?urlpath=lab/tree/demo.ipynb)
|
[![Codacy Badge](https://app.codacy.com/project/badge/Grade/888dbf663fdd409e8d8fcf8472114194)](https://www.codacy.com/gh/opencultureconsulting/openrefine-task-runner/dashboard) [![Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/felixlohmeier/openrefineder/master)
|
||||||
|
|
||||||
Templates for OpenRefine batch processing (import, transform, export) using the task runner [go-task](https://github.com/go-task/task) and the [openrefine-client](https://github.com/opencultureconsulting/openrefine-client) to control OpenRefine via [its HTTP API](https://docs.openrefine.org/technical-reference/openrefine-api).
|
Templates for OpenRefine batch processing (import, transform, export) using the task runner [go-task](https://github.com/go-task/task) and the [openrefine-client](https://github.com/opencultureconsulting/openrefine-client) to control OpenRefine via [its HTTP API](https://docs.openrefine.org/technical-reference/openrefine-api).
|
||||||
|
|
||||||
## Features
|
## Features
|
||||||
|
|
||||||
* run tasks in parallel
|
|
||||||
* basic error handling by monitoring the OpenRefine server log
|
* basic error handling by monitoring the OpenRefine server log
|
||||||
* dedicated OpenRefine instances for each task (your existing OpenRefine data will not be touched)
|
* dedicated OpenRefine instance with temporary workspace (your existing OpenRefine data will not be touched)
|
||||||
* prevent unnecessary work by fingerprinting generated files and their sources
|
* prevent unnecessary work by fingerprinting generated files and their sources
|
||||||
* the [openrefine-client](https://github.com/opencultureconsulting/openrefine-client) used here supports many core features of OpenRefine:
|
* the [openrefine-client](https://github.com/opencultureconsulting/openrefine-client) used here supports many core features of OpenRefine:
|
||||||
* import CSV, TSV, line-based TXT, fixed-width TXT, JSON or XML (and specify input options)
|
* import CSV, TSV, line-based TXT, fixed-width TXT, JSON or XML (and specify input options)
|
||||||
* apply [undo/redo history](https://docs.openrefine.org/manual/running/#reusing-operations) from given JSON file(s)
|
* apply [undo/redo history](https://docs.openrefine.org/manual/running/#reusing-operations) from given JSON file(s)
|
||||||
* export to CSV, TSV, HTML, XLS, XLSX, ODS
|
* export to CSV, TSV, HTML, XLS, XLSX, ODS
|
||||||
* [templating export](https://github.com/opencultureconsulting/openrefine-client#templating) to additional formats like JSON or XML
|
* [templating export](https://github.com/opencultureconsulting/openrefine-client#templating) to additional formats like JSON or XML
|
||||||
* works with OpenRefine 2.7, 2.8, 3.0, 3.1, 3.2, 3.3, 3.4, 3.4.1 and 3.5
|
* works with OpenRefine 2.7, 2.8, 3.0, 3.1, 3.2, 3.3, 3.4 and 3.5
|
||||||
* tasks are easy to extend with additional commands (e.g. to download input data or validate results)
|
* tasks are easy to extend with additional commands (e.g. to download input data or validate results)
|
||||||
|
|
||||||
## Typical workflow
|
## Typical workflow
|
||||||
|
@ -26,10 +25,10 @@ Templates for OpenRefine batch processing (import, transform, export) using the
|
||||||
|
|
||||||
**Possible automation benefits:**
|
**Possible automation benefits:**
|
||||||
|
|
||||||
* When you receive updated data (in the same structure), you just need to drop the file and start the task like this:
|
* When you receive updated data (in the same structure), you just need to drop the input file and start the task like this:
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
task example-doaj
|
task
|
||||||
```
|
```
|
||||||
|
|
||||||
* The entire data processing (including options during import) becomes reproducible. The task configuration file can also be used for documentation through source code comments.
|
* The entire data processing (including options during import) becomes reproducible. The task configuration file can also be used for documentation through source code comments.
|
||||||
|
@ -38,18 +37,17 @@ Templates for OpenRefine batch processing (import, transform, export) using the
|
||||||
|
|
||||||
## Requirements
|
## Requirements
|
||||||
|
|
||||||
* GNU/Linux (tested with Fedora 32)
|
* GNU/Linux (tested with Fedora 34)
|
||||||
* Java 8+ (for OpenRefine)
|
* Java 8+ (for OpenRefine)
|
||||||
|
|
||||||
## Demo via binder
|
## Demo via binder
|
||||||
|
|
||||||
[![Binder](https://notebooks.gesis.org/binder/badge_logo.svg)](https://notebooks.gesis.org/binder/v2/gh/opencultureconsulting/openrefine-task-runner/main?urlpath=lab/tree/demo.ipynb)
|
[![Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/felixlohmeier/openrefineder/master)
|
||||||
|
|
||||||
- free to use on-demand server with Jupyterlab and Bash Kernel
|
- free to use on-demand server with Jupyterlab and Bash Kernel
|
||||||
- OpenRefine, openrefine-client and go-task [preinstalled](binder/postBuild)
|
- OpenRefine, openrefine-client and go-task [preinstalled](binder/postBuild)
|
||||||
- no registration needed, will start within a few minutes
|
- no registration needed, will start within a few minutes
|
||||||
- [restricted](https://notebooks.gesis.org/faq/) to 4 GB RAM and server will be deleted after 10 minutes of inactivity
|
- [restricted](https://mybinder.readthedocs.io/en/latest/about/about.html#how-much-memory-am-i-given-when-using-binder) to 2 GB RAM and server will be deleted after 10 minutes of inactivity
|
||||||
- service is provided by GESIS and is intended for use by social scientists
|
|
||||||
|
|
||||||
## Install
|
## Install
|
||||||
|
|
||||||
|
@ -60,7 +58,7 @@ Templates for OpenRefine batch processing (import, transform, export) using the
|
||||||
cd openrefine-task-runner
|
cd openrefine-task-runner
|
||||||
```
|
```
|
||||||
|
|
||||||
2. Install [Task 3.10.0](https://github.com/go-task/task/releases/tag/v3.10.0)
|
2. Install [Task 3.10.0](https://github.com/go-task/task/releases/tag/v3.10.0)+
|
||||||
|
|
||||||
a) RPM-based (Fedora, CentOS, SLES, etc.)
|
a) RPM-based (Fedora, CentOS, SLES, etc.)
|
||||||
|
|
||||||
|
@ -84,34 +82,28 @@ Templates for OpenRefine batch processing (import, transform, export) using the
|
||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
|
|
||||||
* Run all tasks in parallel
|
* Run default task (start, import, transform, export, stats, check, kill and cleanup)
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
task
|
task default
|
||||||
```
|
```
|
||||||
|
|
||||||
* Run a specific task
|
* Override settings with environment variables
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
task example-duplicates:main
|
OPENREFINE_MEMORY=2000M OPENREFINE_PORT=3334 task default
|
||||||
```
|
|
||||||
|
|
||||||
* Run some tasks in parallel
|
|
||||||
|
|
||||||
```sh
|
|
||||||
task --parallel example-duplicates:main example-doaj:main
|
|
||||||
```
|
```
|
||||||
|
|
||||||
* Force run a task even when the task is up-to-date
|
* Force run a task even when the task is up-to-date
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
task example-duplicates:main --force
|
task default --force
|
||||||
```
|
```
|
||||||
|
|
||||||
* Dry-run in verbose mode for debugging
|
* Dry-run in verbose mode for debugging
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
task example-duplicates:main --dry --verbose --force
|
task default --dry --verbose --force
|
||||||
```
|
```
|
||||||
|
|
||||||
* List available tasks
|
* List available tasks
|
||||||
|
@ -120,17 +112,9 @@ Templates for OpenRefine batch processing (import, transform, export) using the
|
||||||
task --list
|
task --list
|
||||||
```
|
```
|
||||||
|
|
||||||
### How to develop your own tasks
|
### Examples
|
||||||
|
|
||||||
(first draft, will be elaborated later)
|
* [noah-biejournals](https://github.com/opencultureconsulting/noah-biejournals): Harvesting the BieJournals journal server of Bielefeld University Library and transforming the data into METS/MODS for the portal noah.nrw
|
||||||
|
|
||||||
1. create a new folder
|
|
||||||
2. copy an example Taskfile.yml
|
|
||||||
3. provide input data in subdirectory input
|
|
||||||
4. provide OpenRefine transformation history files in subdirectory config
|
|
||||||
5. add commands to specific Taskfile (check openrefine-client help screen for available options: `openrefine/client --help`)
|
|
||||||
6. add project to general Taskfile
|
|
||||||
7. check memory load and increase RAM if needed
|
|
||||||
|
|
||||||
### Getting help
|
### Getting help
|
||||||
|
|
||||||
|
|
165
Taskfile.yml
165
Taskfile.yml
|
@ -1,102 +1,109 @@
|
||||||
# https://github.com/opencultureconsulting/openrefine-task-runner
|
|
||||||
|
|
||||||
version: '3'
|
version: '3'
|
||||||
|
|
||||||
includes:
|
|
||||||
example-doaj: example-doaj
|
|
||||||
example-duplicates: example-duplicates
|
|
||||||
example-powerhouse: example-powerhouse
|
|
||||||
# add the directory name of your project here
|
|
||||||
|
|
||||||
silent: true
|
silent: true
|
||||||
output: prefixed
|
|
||||||
|
|
||||||
env:
|
env:
|
||||||
OPENREFINE:
|
OPENREFINE_MEMORY: 5120M
|
||||||
sh: readlink -m .openrefine/refine
|
OPENREFINE_PORT: 3333
|
||||||
CLIENT:
|
OPENREFINE_APPDIR:
|
||||||
sh: readlink -m .openrefine/client
|
sh: readlink -m .openrefine
|
||||||
|
OPENREFINE_TMPDIR:
|
||||||
|
sh: mktemp -d
|
||||||
|
|
||||||
tasks:
|
tasks:
|
||||||
default:
|
default:
|
||||||
desc: execute all projects in parallel
|
desc: run tasks start, import, transform, export, stats, check, kill and cleanup
|
||||||
deps:
|
|
||||||
- task: example-doaj:refine
|
|
||||||
- task: example-duplicates:refine
|
|
||||||
- task: example-powerhouse:refine
|
|
||||||
# add the directory name of your project here
|
|
||||||
cmds:
|
cmds:
|
||||||
- task: check
|
- defer: { task: cleanup } # will run even when one of the following commands fail
|
||||||
|
- task: start
|
||||||
|
- defer: { task: kill } # will run before cleanup
|
||||||
|
- defer: { task: check } # will run before kill
|
||||||
|
- defer: { task: stats } # will run before check
|
||||||
|
- task: import
|
||||||
|
- task: transform
|
||||||
|
- task: export
|
||||||
|
sources:
|
||||||
|
- Taskfile.yml
|
||||||
|
- input/**
|
||||||
|
- config/**
|
||||||
|
generates:
|
||||||
|
- output/**
|
||||||
|
|
||||||
install:
|
install:
|
||||||
desc: (re)install OpenRefine and openrefine-client into subdirectory .openrefine
|
desc: install OpenRefine and openrefine-client into subdirectory ${OPENREFINE_APPDIR}
|
||||||
cmds:
|
cmds:
|
||||||
- | # delete existing install and recreate folder
|
- mkdir -p "${OPENREFINE_APPDIR}"
|
||||||
rm -rf .openrefine
|
- | # install OpenRefine into subdirectory ${OPENREFINE_APPDIR}
|
||||||
mkdir -p .openrefine
|
wget --no-verbose -O openrefine.tar.gz https://github.com/OpenRefine/OpenRefine/releases/download/3.5.2/openrefine-linux-3.5.2.tar.gz
|
||||||
- > # download OpenRefine archive
|
tar -xzf openrefine.tar.gz -C "${OPENREFINE_APPDIR}" --strip 1 && rm openrefine.tar.gz
|
||||||
wget --no-verbose -O openrefine.tar.gz
|
|
||||||
https://github.com/OpenRefine/OpenRefine/releases/download/3.5.2/openrefine-linux-3.5.2.tar.gz
|
|
||||||
- | # install OpenRefine into subdirectory .openrefine
|
|
||||||
tar -xzf openrefine.tar.gz -C .openrefine --strip 1
|
|
||||||
rm openrefine.tar.gz
|
|
||||||
- | # optimize OpenRefine for batch processing
|
- | # optimize OpenRefine for batch processing
|
||||||
sed -i 's/cd `dirname $0`/cd "$(dirname "$0")"/' ".openrefine/refine" # fix path issue in OpenRefine startup file
|
sed -i 's/cd `dirname $0`/cd "$(dirname "$0")"/' "${OPENREFINE_APPDIR}/refine" # fix path issue in OpenRefine startup file
|
||||||
sed -i '$ a JAVA_OPTIONS=-Drefine.headless=true' ".openrefine/refine.ini" # do not try to open OpenRefine in browser
|
sed -i '$ a JAVA_OPTIONS=-Drefine.headless=true' "${OPENREFINE_APPDIR}/refine.ini" # do not try to open OpenRefine in browser
|
||||||
sed -i 's/#REFINE_AUTOSAVE_PERIOD=60/REFINE_AUTOSAVE_PERIOD=1440/' ".openrefine/refine.ini" # set autosave period from 5 minutes to 24 hours
|
sed -i 's/#REFINE_AUTOSAVE_PERIOD=60/REFINE_AUTOSAVE_PERIOD=1440/' "${OPENREFINE_APPDIR}/refine.ini" # set autosave period from 5 minutes to 24 hours
|
||||||
- > # download openrefine-client into subdirectory .openrefine
|
- | # install openrefine-client into subdirectory ${OPENREFINE_APPDIR}
|
||||||
wget --no-verbose -O .openrefine/client
|
wget --no-verbose -O "${OPENREFINE_APPDIR}/client" https://github.com/opencultureconsulting/openrefine-client/releases/download/v0.3.10/openrefine-client_0-3-10_linux
|
||||||
https://github.com/opencultureconsulting/openrefine-client/releases/download/v0.3.10/openrefine-client_0-3-10_linux
|
chmod +x "${OPENREFINE_APPDIR}/client"
|
||||||
- chmod +x .openrefine/client # make client executable
|
|
||||||
|
|
||||||
start:
|
start:
|
||||||
dir: ./{{.DIR}}
|
- | # requirement OpenRefine
|
||||||
cmds:
|
if [ ! -f "${OPENREFINE_APPDIR}/refine" ]; then
|
||||||
- | # verify that OpenRefine is installed
|
|
||||||
if [ ! -f "$OPENREFINE" ]; then
|
|
||||||
echo 1>&2 "OpenRefine missing; try task install"; exit 1
|
echo 1>&2 "OpenRefine missing; try task install"; exit 1
|
||||||
fi
|
fi
|
||||||
- | # delete temporary files and log file of previous run
|
- | # launch OpenRefine with specific data directory and redirect its output to a log file
|
||||||
rm -rf ./*.project* workspace.json
|
"${OPENREFINE_APPDIR}/refine" -v warn -p "$OPENREFINE_PORT" -m "$OPENREFINE_MEMORY" -d "${OPENREFINE_TMPDIR}" >> "${OPENREFINE_TMPDIR}/log.txt" 2>&1 &
|
||||||
rm -rf "{{.PROJECT}}.log"
|
|
||||||
- > # launch OpenRefine with specific data directory and redirect its output to a log file
|
|
||||||
"$OPENREFINE" -v warn -p {{.PORT}} -m {{.RAM}}
|
|
||||||
-d ../{{.DIR}}
|
|
||||||
>> "{{.PROJECT}}.log" 2>&1 &
|
|
||||||
- | # wait until OpenRefine API is available
|
- | # wait until OpenRefine API is available
|
||||||
timeout 30s bash -c "until
|
timeout 30s bash -c "until wget -q -O - -o /dev/null http://localhost:${OPENREFINE_PORT} | cat | grep -q -o OpenRefine; do sleep 1; done"
|
||||||
wget -q -O - -o /dev/null http://localhost:{{.PORT}} | cat | grep -q -o OpenRefine
|
|
||||||
do sleep 1
|
|
||||||
done"
|
|
||||||
|
|
||||||
stop:
|
import:
|
||||||
dir: ./{{.DIR}}
|
- | # import (requires absolute path)
|
||||||
cmds:
|
"${OPENREFINE_APPDIR}/client" \
|
||||||
- | # shut down OpenRefine gracefully
|
--create "$(readlink -m input/duplicates.csv)" \
|
||||||
PID=$(lsof -t -i:{{.PORT}})
|
--projectName myproject \
|
||||||
kill $PID
|
> >(tee -a "${OPENREFINE_TMPDIR}/log.txt") 2>&1
|
||||||
while ps -p $PID > /dev/null; do sleep 1; done
|
|
||||||
- > # archive the OpenRefine project
|
|
||||||
tar cfz
|
|
||||||
"{{.PROJECT}}.openrefine.tar.gz"
|
|
||||||
-C $(grep -l "{{.PROJECT}}" *.project/metadata.json | cut -d '/' -f 1)
|
|
||||||
.
|
|
||||||
- rm -rf ./*.project* workspace.json # delete temporary files
|
|
||||||
|
|
||||||
kill:
|
transform:
|
||||||
dir: ./{{.DIR}}
|
- | # apply undo/redo history
|
||||||
cmds:
|
for f in config/*.json; do
|
||||||
- | # shut down OpenRefine immediately to save time and disk space
|
"${OPENREFINE_APPDIR}/client" myproject --apply "$f" \
|
||||||
PID=$(lsof -t -i:{{.PORT}})
|
> >(tee -a "${OPENREFINE_TMPDIR}/log.txt") 2>&1
|
||||||
kill -9 $PID
|
done
|
||||||
while ps -p $PID > /dev/null; do sleep 1; done
|
|
||||||
- rm -rf ./*.project* workspace.json # delete temporary files
|
export:
|
||||||
|
- mkdir -p output
|
||||||
|
- | # export to TSV
|
||||||
|
"${OPENREFINE_APPDIR}/client" myproject \
|
||||||
|
--output "$(readlink -m output/deduped.tsv)" \
|
||||||
|
> >(tee -a "${OPENREFINE_TMPDIR}/log.txt") 2>&1
|
||||||
|
|
||||||
|
stats:
|
||||||
|
- | # print RAM and CPU usage
|
||||||
|
PID="$(lsof -t -i:${OPENREFINE_PORT})"
|
||||||
|
echo "used $(($(ps --no-headers -o rss -p "$PID") / 1024)) MB RAM" \
|
||||||
|
> >(tee -a "${OPENREFINE_TMPDIR}/log.txt") 2>&1
|
||||||
|
echo "used $(ps --no-headers -o cputime -p "$PID") CPU time" \
|
||||||
|
> >(tee -a "${OPENREFINE_TMPDIR}/log.txt") 2>&1
|
||||||
|
|
||||||
check:
|
check:
|
||||||
desc: check OpenRefine log for any warnings and exit on error
|
- | # check log file for any warnings
|
||||||
dir: ./{{.DIR}}
|
if grep -i 'exception\|error' "${OPENREFINE_TMPDIR}/log.txt"
|
||||||
cmds:
|
then echo 1>&2 "log contains warnings!"; echo; cat "${OPENREFINE_TMPDIR}/log.txt"; exit 1
|
||||||
- | # find log file(s) and check for "exception" or "error"
|
|
||||||
if grep -i 'exception\|error' $(find . -name '*.log'); then
|
|
||||||
echo 1>&2 "log contains warnings!"; exit 1
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
stop:
|
||||||
|
- | # shut down OpenRefine gracefully
|
||||||
|
PID="$(lsof -t -i:${OPENREFINE_PORT})"
|
||||||
|
kill $PID; while ps -p $PID > /dev/null; do sleep 1; done
|
||||||
|
|
||||||
|
kill:
|
||||||
|
- | # shut down OpenRefine immediately to save time
|
||||||
|
PID="$(lsof -t -i:${OPENREFINE_PORT})"
|
||||||
|
kill -9 $PID
|
||||||
|
|
||||||
|
cleanup: rm -rf "${OPENREFINE_TMPDIR}"
|
||||||
|
|
||||||
|
git:
|
||||||
|
desc: commit and push if something changed
|
||||||
|
cmds:
|
||||||
|
- git add -A
|
||||||
|
- git commit -m "latest change $(date -u)" || exit 0
|
||||||
|
- git push
|
||||||
|
|
|
@ -1 +0,0 @@
|
||||||
{"cells":[{"metadata":{},"cell_type":"markdown","source":"## Run all tasks in parallel"},{"metadata":{"trusted":true},"cell_type":"code","source":"task","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"## Run a specific task"},{"metadata":{"trusted":true},"cell_type":"code","source":"task example-duplicates:main","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"## Run some tasks in parallel"},{"metadata":{"trusted":true},"cell_type":"code","source":"task --parallel example-duplicates:main example-doaj:main","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"## Force run a task even when the task is up-to-date"},{"metadata":{"trusted":true},"cell_type":"code","source":"task example-duplicates:main --force","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"## Dry-run in verbose mode for debugging"},{"metadata":{"trusted":true},"cell_type":"code","source":"task example-duplicates:main --dry --verbose --force","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"## List available tasks"},{"metadata":{"trusted":true},"cell_type":"code","source":"task --list","execution_count":null,"outputs":[]}],"metadata":{"kernelspec":{"name":"bash","display_name":"Bash","language":"bash"},"language_info":{"name":"bash","codemirror_mode":"shell","mimetype":"text/x-sh","file_extension":".sh"}},"nbformat":4,"nbformat_minor":5}
|
|
|
@ -1,70 +0,0 @@
|
||||||
version: '3'
|
|
||||||
|
|
||||||
tasks:
|
|
||||||
main:
|
|
||||||
desc: Library Carpentry Lesson covering DOAJ
|
|
||||||
vars:
|
|
||||||
DIR: '{{splitList ":" .TASK | first}}' # results in the task namespace, which is identical to the directory name
|
|
||||||
cmds:
|
|
||||||
- task: refine
|
|
||||||
- task: :check # check OpenRefine log for any warnings and exit on error
|
|
||||||
vars: {DIR: '{{.DIR}}'}
|
|
||||||
|
|
||||||
refine:
|
|
||||||
dir: ./{{.DIR}}
|
|
||||||
vars:
|
|
||||||
DIR: '{{splitList ":" .TASK | first}}'
|
|
||||||
PROJECT: doaj
|
|
||||||
PORT: 3334 # assign a different port for each project
|
|
||||||
RAM: 2048M # maximum RAM for OpenRefine java heap space
|
|
||||||
LOG: '>(tee -a "{{.PROJECT}}.log") 2>&1' # be careful when making changes here, as the path to the log file should match the server log (see main task "start")
|
|
||||||
deps:
|
|
||||||
- task: download # will be executed each run independent of up-to-date check
|
|
||||||
cmds:
|
|
||||||
- task: :start # launch OpenRefine
|
|
||||||
vars: {DIR: '{{.DIR}}', PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}', RAM: '{{.RAM}}'}
|
|
||||||
- > # import file
|
|
||||||
"$CLIENT" -P {{.PORT}}
|
|
||||||
--create "$(readlink -m input/doaj-article-sample.csv)"
|
|
||||||
--projectName "{{.PROJECT}}"
|
|
||||||
> {{.LOG}}
|
|
||||||
- > # apply transformation rules
|
|
||||||
"$CLIENT" -P {{.PORT}} "{{.PROJECT}}"
|
|
||||||
--apply config/doaj-openrefine.json
|
|
||||||
> {{.LOG}}
|
|
||||||
- mkdir -p output
|
|
||||||
- > # export to file
|
|
||||||
"$CLIENT" -P {{.PORT}} "{{.PROJECT}}"
|
|
||||||
--output "$(readlink -m output/doaj-results.tsv)"
|
|
||||||
> {{.LOG}}
|
|
||||||
- | # print allocated system resources
|
|
||||||
PID="$(lsof -t -i:{{.PORT}})"
|
|
||||||
echo "used $(($(ps --no-headers -o rss -p "$PID") / 1024)) MB RAM" > {{.LOG}}
|
|
||||||
echo "used $(ps --no-headers -o cputime -p "$PID") CPU time" > {{.LOG}}
|
|
||||||
- task: :stop # shut down OpenRefine and archive the OpenRefine project
|
|
||||||
vars: {DIR: '{{.DIR}}', PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
|
|
||||||
sources:
|
|
||||||
- Taskfile.yml
|
|
||||||
- input/**
|
|
||||||
- config/**
|
|
||||||
generates:
|
|
||||||
- ./{{.PROJECT}}.openrefine.tar.gz
|
|
||||||
- output/**
|
|
||||||
ignore_error: true # workaround to avoid an orphaned Java process on error https://github.com/go-task/task/issues/141
|
|
||||||
|
|
||||||
download:
|
|
||||||
dir: ./{{.DIR}}
|
|
||||||
vars:
|
|
||||||
DIR: '{{splitList ":" .TASK | first}}'
|
|
||||||
cmds:
|
|
||||||
- mkdir -p input config
|
|
||||||
- > # Download input
|
|
||||||
wget --no-verbose -O input/doaj-article-sample.csv
|
|
||||||
https://github.com/felixlohmeier/openrefine-kimws2019/raw/master/doaj-article-sample.csv
|
|
||||||
- > # Download config
|
|
||||||
wget --no-verbose -O config/doaj-openrefine.json
|
|
||||||
https://github.com/felixlohmeier/openrefine-kimws2019/raw/master/doaj-openrefine.json
|
|
||||||
|
|
||||||
default: # enable standalone execution (running `task` in project directory)
|
|
||||||
cmds:
|
|
||||||
- DIR="${PWD##*/}:main" && cd .. && task "$DIR"
|
|
|
@ -1,56 +0,0 @@
|
||||||
version: '3'
|
|
||||||
|
|
||||||
tasks:
|
|
||||||
main:
|
|
||||||
desc: Removing duplicates in a very small test dataset
|
|
||||||
vars:
|
|
||||||
DIR: '{{splitList ":" .TASK | first}}' # results in the task namespace, which is identical to the directory name
|
|
||||||
cmds:
|
|
||||||
- task: refine
|
|
||||||
- task: :check # check OpenRefine log for any warnings and exit on error
|
|
||||||
vars: {DIR: '{{.DIR}}'}
|
|
||||||
|
|
||||||
refine:
|
|
||||||
dir: ./{{.DIR}}
|
|
||||||
vars:
|
|
||||||
DIR: '{{splitList ":" .TASK | first}}'
|
|
||||||
PROJECT: duplicates
|
|
||||||
PORT: 3335 # assign a different port for each project
|
|
||||||
RAM: 2048M # maximum RAM for OpenRefine java heap space
|
|
||||||
LOG: '>(tee -a "{{.PROJECT}}.log") 2>&1' # be careful when making changes here, as the path to the log file should match the server log (see main task "start")
|
|
||||||
cmds:
|
|
||||||
- task: :start # launch OpenRefine
|
|
||||||
vars: {DIR: '{{.DIR}}', PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}', RAM: '{{.RAM}}'}
|
|
||||||
- > # import file
|
|
||||||
"$CLIENT" -P {{.PORT}}
|
|
||||||
--create "$(readlink -m input/duplicates.csv)"
|
|
||||||
--encoding UTF-8
|
|
||||||
--projectName "{{.PROJECT}}"
|
|
||||||
> {{.LOG}}
|
|
||||||
- > # apply transformation rules
|
|
||||||
"$CLIENT" -P {{.PORT}} "{{.PROJECT}}"
|
|
||||||
--apply config/duplicates-deletion.json
|
|
||||||
> {{.LOG}}
|
|
||||||
- mkdir -p output
|
|
||||||
- > # export to file
|
|
||||||
"$CLIENT" -P {{.PORT}} "{{.PROJECT}}"
|
|
||||||
--output "$(readlink -m output/deduped.xls)"
|
|
||||||
> {{.LOG}}
|
|
||||||
- | # print allocated system resources
|
|
||||||
PID="$(lsof -t -i:{{.PORT}})"
|
|
||||||
echo "used $(($(ps --no-headers -o rss -p "$PID") / 1024)) MB RAM" > {{.LOG}}
|
|
||||||
echo "used $(ps --no-headers -o cputime -p "$PID") CPU time" > {{.LOG}}
|
|
||||||
- task: :stop # shut down OpenRefine and archive the OpenRefine project
|
|
||||||
vars: {DIR: '{{.DIR}}', PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
|
|
||||||
sources:
|
|
||||||
- Taskfile.yml
|
|
||||||
- input/**
|
|
||||||
- config/**
|
|
||||||
generates:
|
|
||||||
- ./{{.PROJECT}}.openrefine.tar.gz
|
|
||||||
- output/**
|
|
||||||
ignore_error: true # workaround to avoid an orphaned Java process on error https://github.com/go-task/task/issues/141
|
|
||||||
|
|
||||||
default: # enable standalone execution (running `task` in project directory)
|
|
||||||
cmds:
|
|
||||||
- DIR="${PWD##*/}:main" && cd .. && task "$DIR"
|
|
|
@ -1,72 +0,0 @@
|
||||||
version: '3'
|
|
||||||
|
|
||||||
tasks:
|
|
||||||
main:
|
|
||||||
desc: Powerhouse Museum Tutorial
|
|
||||||
vars:
|
|
||||||
DIR: '{{splitList ":" .TASK | first}}' # results in the task namespace, which is identical to the directory name
|
|
||||||
cmds:
|
|
||||||
- task: refine
|
|
||||||
- task: :check # check OpenRefine log for any warnings and exit on error
|
|
||||||
vars: {DIR: '{{.DIR}}'}
|
|
||||||
|
|
||||||
refine:
|
|
||||||
dir: ./{{.DIR}}
|
|
||||||
vars:
|
|
||||||
DIR: '{{splitList ":" .TASK | first}}'
|
|
||||||
PROJECT: phm
|
|
||||||
PORT: 3336 # assign a different port for each project
|
|
||||||
RAM: 2048M # maximum RAM for OpenRefine java heap space
|
|
||||||
LOG: '>(tee -a "{{.PROJECT}}.log") 2>&1' # be careful when making changes here, as the path to the log file should match the server log (see main task "start")
|
|
||||||
deps:
|
|
||||||
- task: download # will be executed each run independent of up-to-date check
|
|
||||||
cmds:
|
|
||||||
- task: :start # launch OpenRefine
|
|
||||||
vars: {DIR: '{{.DIR}}', PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}', RAM: '{{.RAM}}'}
|
|
||||||
- > # import file
|
|
||||||
"$CLIENT" -P {{.PORT}}
|
|
||||||
--create "$(readlink -m input/phm-collection.tsv)"
|
|
||||||
--processQuotes false
|
|
||||||
--guessCellValueTypes true
|
|
||||||
--projectName "{{.PROJECT}}"
|
|
||||||
> {{.LOG}}
|
|
||||||
- > # apply transformation rules
|
|
||||||
"$CLIENT" -P {{.PORT}} "{{.PROJECT}}"
|
|
||||||
--apply config/phm-transform.json
|
|
||||||
> {{.LOG}}
|
|
||||||
- mkdir -p output
|
|
||||||
- > # export to file
|
|
||||||
"$CLIENT" -P {{.PORT}} "{{.PROJECT}}"
|
|
||||||
--output "$(readlink -m output/phm-results.tsv)"
|
|
||||||
> {{.LOG}}
|
|
||||||
- | # print allocated system resources
|
|
||||||
PID="$(lsof -t -i:{{.PORT}})"
|
|
||||||
echo "used $(($(ps --no-headers -o rss -p "$PID") / 1024)) MB RAM" > {{.LOG}}
|
|
||||||
echo "used $(ps --no-headers -o cputime -p "$PID") CPU time" > {{.LOG}}
|
|
||||||
- task: :stop # shut down OpenRefine and archive the OpenRefine project
|
|
||||||
vars: {DIR: '{{.DIR}}', PORT: '{{.PORT}}', PROJECT: '{{.PROJECT}}'}
|
|
||||||
sources:
|
|
||||||
- Taskfile.yml
|
|
||||||
- input/**
|
|
||||||
- config/**
|
|
||||||
generates:
|
|
||||||
- ./{{.PROJECT}}.openrefine.tar.gz
|
|
||||||
- output/**
|
|
||||||
ignore_error: true # workaround to avoid an orphaned Java process on error https://github.com/go-task/task/issues/141
|
|
||||||
|
|
||||||
download:
|
|
||||||
dir: ./{{.DIR}}
|
|
||||||
vars:
|
|
||||||
DIR: '{{splitList ":" .TASK | first}}'
|
|
||||||
cmds:
|
|
||||||
- mkdir -p input config
|
|
||||||
- > # Download input
|
|
||||||
wget --no-verbose -O input/phm-collection.tsv
|
|
||||||
https://github.com/opencultureconsulting/openrefine-batch/raw/master/examples/powerhouse-museum/input/phm-collection.tsv
|
|
||||||
- > # Download config
|
|
||||||
wget --no-verbose -O config/phm-transform.json
|
|
||||||
https://github.com/opencultureconsulting/openrefine-batch/raw/master/examples/powerhouse-museum/config/phm-transform.json
|
|
||||||
|
|
||||||
default: # enable standalone execution (running `task` in project directory)
|
|
||||||
cmds:
|
|
||||||
- DIR="${PWD##*/}:main" && cd .. && task "$DIR"
|
|
Loading…
Reference in New Issue