release v1.1, supports OpenRefine 2.7
This commit is contained in:
parent
246fbb72b2
commit
4c6cd7dbd4
94
README.md
94
README.md
|
@ -64,7 +64,7 @@ clone or [download GitHub repository](https://github.com/felixlohmeier/openrefin
|
||||||
### Help Screen
|
### Help Screen
|
||||||
|
|
||||||
```
|
```
|
||||||
[18:20 felix ~/openrefine-batch]$ ./openrefine-batch.sh
|
[14:45 felix ~/openrefine-batch]$ ./openrefine-batch.sh
|
||||||
Usage: ./openrefine-batch.sh [-a INPUTDIR] [-b TRANSFORMDIR] [-c OUTPUTDIR] ...
|
Usage: ./openrefine-batch.sh [-a INPUTDIR] [-b TRANSFORMDIR] [-c OUTPUTDIR] ...
|
||||||
|
|
||||||
== basic arguments ==
|
== basic arguments ==
|
||||||
|
@ -106,11 +106,10 @@ Usage: ./openrefine-batch.sh [-a INPUTDIR] [-b TRANSFORMDIR] [-c OUTPUTDIR] ...
|
||||||
|
|
||||||
== example ==
|
== example ==
|
||||||
|
|
||||||
./openrefine-batch.sh -a examples/powerhouse-museum/input/ -b examples/powerhouse-museum/config/ -c examples/powerhouse-museum/output/ -f tsv -i processQuotes=false -i guessCellValueTypes=true
|
./openrefine-batch.sh -a examples/powerhouse-museum/input/ -b examples/powerhouse-museum/config/ -c examples/powerhouse-museum/output/ -f tsv -i processQuotes=false -i guessCellValueTypes=true -RX
|
||||||
|
|
||||||
clone or download GitHub repository to get example data:
|
clone or download GitHub repository to get example data:
|
||||||
https://github.com/felixlohmeier/openrefine-batch/archive/master.zip
|
https://github.com/felixlohmeier/openrefine-batch/archive/master.zip
|
||||||
|
|
||||||
```
|
```
|
||||||
|
|
||||||
### Logging
|
### Logging
|
||||||
|
@ -118,14 +117,17 @@ https://github.com/felixlohmeier/openrefine-batch/archive/master.zip
|
||||||
The script prints log messages from OpenRefine server and makes use of `ps` to show statistics for each step. Here is a sample:
|
The script prints log messages from OpenRefine server and makes use of `ps` to show statistics for each step. Here is a sample:
|
||||||
|
|
||||||
```
|
```
|
||||||
[17:55 felix ~/openrefine-batch]$ ./openrefine-batch.sh \
|
[14:46 felix ~/openrefine-batch]$ ./openrefine-batch.sh -a examples/powerhouse-museum/input/ -b examples/powerhouse-museum/config/ -c examples/powerhouse-museum/output/ -f tsv -i processQuotes=false -i guessCellValueTypes=true -RX
|
||||||
> -a examples/powerhouse-museum/input/ \
|
Download OpenRefine...
|
||||||
> -b examples/powerhouse-museum/config/ \
|
openrefine-linux-2.7.tar.g 100%[=====================================>] 60,23M 8,89MB/s in 11s
|
||||||
> -c examples/powerhouse-museum/output/ \
|
Install OpenRefine in subdirectory openrefine...
|
||||||
> -f tsv \
|
Total bytes read: 72990720 (70MiB, 136MiB/s)
|
||||||
> -i processQuotes=false \
|
|
||||||
> -i guessCellValueTypes=true \
|
Download OpenRefine client...
|
||||||
> -RX
|
v0.3.1.tar.gz [ <=> ] 563,12K 1015KB/s in 0,6s
|
||||||
|
Install OpenRefine client in subdirectory openrefine-client...
|
||||||
|
Total bytes read: 3082240 (3,0MiB, 90MiB/s)
|
||||||
|
|
||||||
Input directory: /home/felix/openrefine-batch/examples/powerhouse-museum/input
|
Input directory: /home/felix/openrefine-batch/examples/powerhouse-museum/input
|
||||||
Input files: phm-collection.tsv
|
Input files: phm-collection.tsv
|
||||||
Input format: --format=tsv
|
Input format: --format=tsv
|
||||||
|
@ -143,80 +145,80 @@ restart after transform: false
|
||||||
|
|
||||||
=== 1. Launch OpenRefine ===
|
=== 1. Launch OpenRefine ===
|
||||||
|
|
||||||
starting time: Di 14. Mär 17:58:08 CET 2017
|
starting time: Di 20. Jun 13:51:06 CEST 2017
|
||||||
|
|
||||||
Starting OpenRefine at 'http://127.0.0.1:3333/'
|
Starting OpenRefine at 'http://127.0.0.1:3333/'
|
||||||
|
|
||||||
17:58:08.758 [ refine_server] Starting Server bound to '127.0.0.1:3333' (0ms)
|
13:51:06.727 [ refine_server] Starting Server bound to '127.0.0.1:3333' (0ms)
|
||||||
17:58:08.760 [ refine_server] refine.memory size: 2048M JVM Max heap: 1908932608 (2ms)
|
13:51:06.728 [ refine_server] refine.memory size: 2048M JVM Max heap: 1908932608 (1ms)
|
||||||
17:58:08.787 [ refine_server] Initializing context: '/' from '/home/felix/openrefine-batch/openrefine/webapp' (27ms)
|
13:51:06.737 [ refine_server] Initializing context: '/' from '/home/felix/openrefine-batch/openrefine/webapp' (9ms)
|
||||||
17:58:09.463 [ refine] Starting OpenRefine 2.7-rc.1 [TRUNK]... (676ms)
|
13:51:06.973 [ refine] Starting OpenRefine 2.7 [TRUNK]... (236ms)
|
||||||
17:58:09.476 [ FileProjectManager] Failed to load workspace from any attempted alternatives. (13ms)
|
13:51:06.978 [ FileProjectManager] Failed to load workspace from any attempted alternatives. (5ms)
|
||||||
17:58:12.003 [ refine] Running in headless mode (2527ms)
|
13:51:09.377 [ refine] Running in headless mode (2399ms)
|
||||||
|
|
||||||
=== 2. Import all files ===
|
=== 2. Import all files ===
|
||||||
|
|
||||||
starting time: Di 14. Mär 17:58:12 CET 2017
|
starting time: Di 20. Jun 13:51:09 CEST 2017
|
||||||
|
|
||||||
import phm-collection.tsv...
|
import phm-collection.tsv...
|
||||||
17:58:13.068 [ refine] POST /command/core/create-project-from-upload (1065ms)
|
13:51:09.900 [ refine] POST /command/core/create-project-from-upload (523ms)
|
||||||
New project: 2073385535316
|
New project: 2034248478869
|
||||||
17:58:26.543 [ refine] GET /command/core/get-rows (13475ms)
|
13:51:14.110 [ refine] GET /command/core/get-rows (4210ms)
|
||||||
Number of rows: 75814
|
Number of rows: 75814
|
||||||
STARTED ELAPSED %MEM %CPU RSS
|
STARTED ELAPSED %MEM %CPU RSS
|
||||||
17:58:07 00:18 9.8 168 795024
|
13:51:05 00:08 5.3 191 864692
|
||||||
|
|
||||||
=== 3. Prepare transform & export ===
|
=== 3. Prepare transform & export ===
|
||||||
|
|
||||||
starting time: Di 14. Mär 17:58:26 CET 2017
|
starting time: Di 20. Jun 13:51:14 CEST 2017
|
||||||
|
|
||||||
get project ids...
|
get project ids...
|
||||||
17:58:26.778 [ refine] GET /command/core/get-all-project-metadata (235ms)
|
13:51:14.207 [ refine] GET /command/core/get-all-project-metadata (97ms)
|
||||||
2073385535316: phm-collection.tsv
|
2034248478869: phm-collection.tsv
|
||||||
|
|
||||||
=== 4. Transform phm-collection.tsv ===
|
=== 4. Transform phm-collection.tsv ===
|
||||||
|
|
||||||
starting time: Di 14. Mär 17:58:26 CET 2017
|
starting time: Di 20. Jun 13:51:14 CEST 2017
|
||||||
|
|
||||||
transform phm-transform.json...
|
transform phm-transform.json...
|
||||||
17:58:26.917 [ refine] GET /command/core/get-models (139ms)
|
13:51:14.265 [ refine] GET /command/core/get-models (58ms)
|
||||||
17:58:26.934 [ refine] POST /command/core/apply-operations (17ms)
|
13:51:14.273 [ refine] POST /command/core/apply-operations (8ms)
|
||||||
STARTED ELAPSED %MEM %CPU RSS
|
STARTED ELAPSED %MEM %CPU RSS
|
||||||
17:58:07 01:02 13.5 134 1096916
|
13:51:05 00:23 7.0 155 1142712
|
||||||
|
|
||||||
|
|
||||||
=== 5. Export phm-collection.tsv ===
|
=== 5. Export phm-collection.tsv ===
|
||||||
|
|
||||||
starting time: Di 14. Mär 17:59:09 CET 2017
|
starting time: Di 20. Jun 13:51:29 CEST 2017
|
||||||
|
|
||||||
export to file phm-collection.tsv...
|
export to file phm-collection.tsv...
|
||||||
17:59:09.944 [ refine] GET /command/core/get-models (43010ms)
|
13:51:29.824 [ refine] GET /command/core/get-models (15551ms)
|
||||||
17:59:09.956 [ refine] GET /command/core/get-all-project-metadata (12ms)
|
13:51:29.827 [ refine] GET /command/core/get-all-project-metadata (3ms)
|
||||||
17:59:09.967 [ refine] POST /command/core/export-rows/phm-collection.tsv.tsv (11ms)
|
13:51:29.841 [ refine] POST /command/core/export-rows/phm-collection.tsv.tsv (14ms)
|
||||||
STARTED ELAPSED %MEM %CPU RSS
|
STARTED ELAPSED %MEM %CPU RSS
|
||||||
17:58:07 02:24 13.5 60.5 1098056
|
13:51:05 00:49 7.0 75.7 1144808
|
||||||
|
|
||||||
|
|
||||||
output (number of lines / size in bytes):
|
output (number of lines / size in bytes):
|
||||||
167017 60527726 /home/felix/openrefine-batch/examples/powerhouse-museum/output/phm-collection.tsv
|
167017 60527726 /home/felix/openrefine-batch/examples/powerhouse-museum/output/phm-collection.tsv
|
||||||
|
|
||||||
cleanup...
|
cleanup...
|
||||||
18:00:35.425 [ ProjectManager] Saving all modified projects ... (85458ms)
|
13:51:55.783 [ ProjectManager] Saving all modified projects ... (25942ms)
|
||||||
18:00:42.357 [ project_utilities] Saved project '2073385535316' (6932ms)
|
13:51:58.324 [ project_utilities] Saved project '2034248478869' (2541ms)
|
||||||
|
|
||||||
=== Statistics ===
|
=== Statistics ===
|
||||||
|
|
||||||
starting time and run time of each step:
|
starting time and run time of each step:
|
||||||
Start process Di 14. Mär 17:58:08 CET 2017 (00:00:00)
|
Start process Di 20. Jun 13:51:06 CEST 2017 (00:00:00)
|
||||||
Launch OpenRefine Di 14. Mär 17:58:08 CET 2017 (00:00:04)
|
Launch OpenRefine Di 20. Jun 13:51:06 CEST 2017 (00:00:03)
|
||||||
Import all files Di 14. Mär 17:58:12 CET 2017 (00:00:14)
|
Import all files Di 20. Jun 13:51:09 CEST 2017 (00:00:05)
|
||||||
Prepare transform & export Di 14. Mär 17:58:26 CET 2017 (00:00:00)
|
Prepare transform & export Di 20. Jun 13:51:14 CEST 2017 (00:00:00)
|
||||||
Transform phm-collection.tsv Di 14. Mär 17:58:26 CET 2017 (00:00:43)
|
Transform phm-collection.tsv Di 20. Jun 13:51:14 CEST 2017 (00:00:15)
|
||||||
Export phm-collection.tsv Di 14. Mär 17:59:09 CET 2017 (00:01:34)
|
Export phm-collection.tsv Di 20. Jun 13:51:29 CEST 2017 (00:00:30)
|
||||||
End process Di 14. Mär 18:00:43 CET 2017 (00:00:00)
|
End process Di 20. Jun 13:51:59 CEST 2017 (00:00:00)
|
||||||
|
|
||||||
total run time: 00:02:35 (hh:mm:ss)
|
total run time: 00:00:53 (hh:mm:ss)
|
||||||
highest memory load: 1072 MB
|
highest memory load: 1117 MB
|
||||||
```
|
```
|
||||||
|
|
||||||
### Docker
|
### Docker
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
# openrefine-batch.sh, Felix Lohmeier, v1.0, 14.03.2017
|
# openrefine-batch-docker.sh, Felix Lohmeier, v1.1, 2017-06-20
|
||||||
# https://github.com/felixlohmeier/openrefine-batch
|
# https://github.com/felixlohmeier/openrefine-batch
|
||||||
|
|
||||||
# check system requirements
|
# check system requirements
|
||||||
|
@ -17,7 +17,7 @@ fi
|
||||||
# help screen
|
# help screen
|
||||||
function usage () {
|
function usage () {
|
||||||
cat <<EOF
|
cat <<EOF
|
||||||
Usage: ./openrefine-batch-docker.sh [-a INPUTDIR] [-b TRANSFORMDIR] [-c OUTPUTDIR] ...
|
Usage: sudo ./openrefine-batch-docker.sh [-a INPUTDIR] [-b TRANSFORMDIR] [-c OUTPUTDIR] ...
|
||||||
|
|
||||||
== basic arguments ==
|
== basic arguments ==
|
||||||
-a INPUTDIR path to directory with source files (leave empty to transform only ; multiple files may be imported into a single project by providing a zip or tar.gz archive, cf. https://github.com/OpenRefine/OpenRefine/wiki/Importers )
|
-a INPUTDIR path to directory with source files (leave empty to transform only ; multiple files may be imported into a single project by providing a zip or tar.gz archive, cf. https://github.com/OpenRefine/OpenRefine/wiki/Importers )
|
||||||
|
@ -29,7 +29,7 @@ Usage: ./openrefine-batch-docker.sh [-a INPUTDIR] [-b TRANSFORMDIR] [-c OUTPUTDI
|
||||||
-f INPUTFORMAT (csv, tsv, xml, json, line-based, fixed-width, xlsx, ods)
|
-f INPUTFORMAT (csv, tsv, xml, json, line-based, fixed-width, xlsx, ods)
|
||||||
-i INPUTOPTIONS several options provided by openrefine-client, see below...
|
-i INPUTOPTIONS several options provided by openrefine-client, see below...
|
||||||
-m RAM maximum RAM for OpenRefine java heap space (default: 2048M)
|
-m RAM maximum RAM for OpenRefine java heap space (default: 2048M)
|
||||||
-v VERSION OpenRefine version (2.7rc2, 2.7rc1, 2.6rc2, 2.6rc1, dev; default: 2.7rc2)
|
-v VERSION OpenRefine version (2.7, 2.7rc2, 2.7rc1, 2.6rc2, 2.6rc1, dev; default: 2.7)
|
||||||
-E do NOT export files
|
-E do NOT export files
|
||||||
-R do NOT restart OpenRefine after each transformation (e.g. config file)
|
-R do NOT restart OpenRefine after each transformation (e.g. config file)
|
||||||
-X do NOT restart OpenRefine after each project (e.g. input file)
|
-X do NOT restart OpenRefine after each project (e.g. input file)
|
||||||
|
@ -58,13 +58,14 @@ Usage: ./openrefine-batch-docker.sh [-a INPUTDIR] [-b TRANSFORMDIR] [-c OUTPUTDI
|
||||||
|
|
||||||
== example ==
|
== example ==
|
||||||
|
|
||||||
./openrefine-batch-docker.sh \
|
sudo ./openrefine-batch-docker.sh \
|
||||||
-a examples/powerhouse-museum/input/ \
|
-a examples/powerhouse-museum/input/ \
|
||||||
-b examples/powerhouse-museum/config/ \
|
-b examples/powerhouse-museum/config/ \
|
||||||
-c examples/powerhouse-museum/output/ \
|
-c examples/powerhouse-museum/output/ \
|
||||||
-f tsv \
|
-f tsv \
|
||||||
-i processQuotes=false \
|
-i processQuotes=false \
|
||||||
-i guessCellValueTypes=true
|
-i guessCellValueTypes=true \
|
||||||
|
-RX
|
||||||
|
|
||||||
clone or download GitHub repository to get example data:
|
clone or download GitHub repository to get example data:
|
||||||
https://github.com/felixlohmeier/openrefine-batch/archive/master.zip
|
https://github.com/felixlohmeier/openrefine-batch/archive/master.zip
|
||||||
|
@ -75,7 +76,7 @@ EOF
|
||||||
|
|
||||||
# defaults
|
# defaults
|
||||||
ram="2048M"
|
ram="2048M"
|
||||||
version="2.7rc2"
|
version="2.7"
|
||||||
restartfile="true"
|
restartfile="true"
|
||||||
restarttransform="true"
|
restarttransform="true"
|
||||||
export="true"
|
export="true"
|
||||||
|
@ -160,6 +161,7 @@ echo ""
|
||||||
checkpoints=${#checkpointdate[@]}
|
checkpoints=${#checkpointdate[@]}
|
||||||
checkpointdate[$(($checkpoints + 1))]=$(date +%s)
|
checkpointdate[$(($checkpoints + 1))]=$(date +%s)
|
||||||
checkpointname[$(($checkpoints + 1))]="Start process"
|
checkpointname[$(($checkpoints + 1))]="Start process"
|
||||||
|
memoryload=()
|
||||||
|
|
||||||
# launch server
|
# launch server
|
||||||
checkpoints=${#checkpointdate[@]}
|
checkpoints=${#checkpointdate[@]}
|
||||||
|
@ -169,7 +171,7 @@ echo "=== $checkpoints. ${checkpointname[$(($checkpoints + 1))]} ==="
|
||||||
echo ""
|
echo ""
|
||||||
echo "starting time: $(date --date=@${checkpointdate[$(($checkpoints + 1))]})"
|
echo "starting time: $(date --date=@${checkpointdate[$(($checkpoints + 1))]})"
|
||||||
echo ""
|
echo ""
|
||||||
docker run -d --name=${uuid} -v ${outputdir}:/data felixlohmeier/openrefine:${version} -i 0.0.0.0 -m ${ram} -d /data
|
docker run -d --name=${uuid} -v ${outputdir}:/data:z felixlohmeier/openrefine:${version} -i 0.0.0.0 -m ${ram} -d /data
|
||||||
# wait until server is available
|
# wait until server is available
|
||||||
until docker run --rm --link ${uuid} --entrypoint /usr/bin/curl felixlohmeier/openrefine-client --silent -N http://${uuid}:3333 | cat | grep -q -o "OpenRefine" ; do sleep 1; done
|
until docker run --rm --link ${uuid} --entrypoint /usr/bin/curl felixlohmeier/openrefine-client --silent -N http://${uuid}:3333 | cat | grep -q -o "OpenRefine" ; do sleep 1; done
|
||||||
# show server logs
|
# show server logs
|
||||||
|
@ -188,16 +190,17 @@ if [ -n "$inputfiles" ]; then
|
||||||
for inputfile in "${inputfiles[@]}" ; do
|
for inputfile in "${inputfiles[@]}" ; do
|
||||||
echo "import ${inputfile}..."
|
echo "import ${inputfile}..."
|
||||||
# run client with input command
|
# run client with input command
|
||||||
docker run --rm --link ${uuid} -v ${inputdir}:/data felixlohmeier/openrefine-client -H ${uuid} -c $inputfile $inputformat ${inputoptions[@]}
|
docker run --rm --link ${uuid} -v ${inputdir}:/data:z felixlohmeier/openrefine-client -H ${uuid} -c $inputfile $inputformat ${inputoptions[@]}
|
||||||
# show allocated system resources
|
# show allocated system resources
|
||||||
ps -o start,etime,%mem,%cpu,rss -C java --sort=start
|
ps -o start,etime,%mem,%cpu,rss -C java --sort=start
|
||||||
|
memoryload+=($(ps --no-headers -o rss -C java))
|
||||||
echo ""
|
echo ""
|
||||||
# restart server to clear memory
|
# restart server to clear memory
|
||||||
if [ "$restartfile" = "true" ]; then
|
if [ "$restartfile" = "true" ]; then
|
||||||
echo "save project and restart OpenRefine server..."
|
echo "save project and restart OpenRefine server..."
|
||||||
docker stop -t=5000 ${uuid}
|
docker stop -t=5000 ${uuid}
|
||||||
docker rm ${uuid}
|
docker rm ${uuid}
|
||||||
docker run -d --name=${uuid} -v ${outputdir}:/data felixlohmeier/openrefine:${version} -i 0.0.0.0 -m ${ram} -d /data
|
docker run -d --name=${uuid} -v ${outputdir}:/data:z felixlohmeier/openrefine:${version} -i 0.0.0.0 -m ${ram} -d /data
|
||||||
until docker run --rm --link ${uuid} --entrypoint /usr/bin/curl felixlohmeier/openrefine-client --silent -N http://${uuid}:3333 | cat | grep -q -o "OpenRefine" ; do sleep 1; done
|
until docker run --rm --link ${uuid} --entrypoint /usr/bin/curl felixlohmeier/openrefine-client --silent -N http://${uuid}:3333 | cat | grep -q -o "OpenRefine" ; do sleep 1; done
|
||||||
docker attach ${uuid} &
|
docker attach ${uuid} &
|
||||||
echo ""
|
echo ""
|
||||||
|
@ -232,7 +235,7 @@ if [ -n "$jsonfiles" ] || [ "$export" = "true" ]; then
|
||||||
echo "restart OpenRefine server to advertise copied projects..."
|
echo "restart OpenRefine server to advertise copied projects..."
|
||||||
docker stop -t=5000 ${uuid}
|
docker stop -t=5000 ${uuid}
|
||||||
docker rm ${uuid}
|
docker rm ${uuid}
|
||||||
docker run -d --name=${uuid} -v ${outputdir}:/data felixlohmeier/openrefine:${version} -i 0.0.0.0 -m ${ram} -d /data
|
docker run -d --name=${uuid} -v ${outputdir}:/data:z felixlohmeier/openrefine:${version} -i 0.0.0.0 -m ${ram} -d /data
|
||||||
until docker run --rm --link ${uuid} --entrypoint /usr/bin/curl felixlohmeier/openrefine-client --silent -N http://${uuid}:3333 | cat | grep -q -o "OpenRefine" ; do sleep 1; done
|
until docker run --rm --link ${uuid} --entrypoint /usr/bin/curl felixlohmeier/openrefine-client --silent -N http://${uuid}:3333 | cat | grep -q -o "OpenRefine" ; do sleep 1; done
|
||||||
docker attach ${uuid} &
|
docker attach ${uuid} &
|
||||||
echo ""
|
echo ""
|
||||||
|
@ -253,16 +256,17 @@ if [ -n "$jsonfiles" ] || [ "$export" = "true" ]; then
|
||||||
for jsonfile in "${jsonfiles[@]}" ; do
|
for jsonfile in "${jsonfiles[@]}" ; do
|
||||||
echo "transform ${jsonfile}..."
|
echo "transform ${jsonfile}..."
|
||||||
# run client with apply command
|
# run client with apply command
|
||||||
docker run --rm --link ${uuid} -v ${configdir}:/data felixlohmeier/openrefine-client -H ${uuid} -f ${jsonfile} ${projectids[i]}
|
docker run --rm --link ${uuid} -v ${configdir}:/data:z felixlohmeier/openrefine-client -H ${uuid} -f ${jsonfile} ${projectids[i]}
|
||||||
# allocated system resources
|
# allocated system resources
|
||||||
ps -o start,etime,%mem,%cpu,rss -C java --sort=start
|
ps -o start,etime,%mem,%cpu,rss -C java --sort=start
|
||||||
|
memoryload+=($(ps --no-headers -o rss -C java))
|
||||||
echo ""
|
echo ""
|
||||||
# restart server to clear memory
|
# restart server to clear memory
|
||||||
if [ "$restarttransform" = "true" ]; then
|
if [ "$restarttransform" = "true" ]; then
|
||||||
echo "save project and restart OpenRefine server..."
|
echo "save project and restart OpenRefine server..."
|
||||||
docker stop -t=5000 ${uuid}
|
docker stop -t=5000 ${uuid}
|
||||||
docker rm ${uuid}
|
docker rm ${uuid}
|
||||||
docker run -d --name=${uuid} -v ${outputdir}:/data felixlohmeier/openrefine:${version} -i 0.0.0.0 -m ${ram} -d /data
|
docker run -d --name=${uuid} -v ${outputdir}:/data:z felixlohmeier/openrefine:${version} -i 0.0.0.0 -m ${ram} -d /data
|
||||||
until docker run --rm --link ${uuid} --entrypoint /usr/bin/curl felixlohmeier/openrefine-client --silent -N http://${uuid}:3333 | cat | grep -q -o "OpenRefine" ; do sleep 1; done
|
until docker run --rm --link ${uuid} --entrypoint /usr/bin/curl felixlohmeier/openrefine-client --silent -N http://${uuid}:3333 | cat | grep -q -o "OpenRefine" ; do sleep 1; done
|
||||||
docker attach ${uuid} &
|
docker attach ${uuid} &
|
||||||
fi
|
fi
|
||||||
|
@ -283,9 +287,10 @@ if [ -n "$jsonfiles" ] || [ "$export" = "true" ]; then
|
||||||
filename=${projectnames[i]%.*}
|
filename=${projectnames[i]%.*}
|
||||||
echo "export to file ${filename}.tsv..."
|
echo "export to file ${filename}.tsv..."
|
||||||
# run client with export command
|
# run client with export command
|
||||||
docker run --rm --link ${uuid} -v ${outputdir}:/data felixlohmeier/openrefine-client -H ${uuid} -E --output="${filename}.tsv" ${projectids[i]}
|
docker run --rm --link ${uuid} -v ${outputdir}:/data:z felixlohmeier/openrefine-client -H ${uuid} -E --output="${filename}.tsv" ${projectids[i]}
|
||||||
# show allocated system resources
|
# show allocated system resources
|
||||||
ps -o start,etime,%mem,%cpu,rss -C java --sort=start
|
ps -o start,etime,%mem,%cpu,rss -C java --sort=start
|
||||||
|
memoryload+=($(ps --no-headers -o rss -C java))
|
||||||
echo ""
|
echo ""
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
@ -294,7 +299,7 @@ if [ -n "$jsonfiles" ] || [ "$export" = "true" ]; then
|
||||||
echo "restart OpenRefine server..."
|
echo "restart OpenRefine server..."
|
||||||
docker stop -t=5000 ${uuid}
|
docker stop -t=5000 ${uuid}
|
||||||
docker rm ${uuid}
|
docker rm ${uuid}
|
||||||
docker run -d --name=${uuid} -v ${outputdir}:/data felixlohmeier/openrefine:${version} -i 0.0.0.0 -m ${ram} -d /data
|
docker run -d --name=${uuid} -v ${outputdir}:/data:z felixlohmeier/openrefine:${version} -i 0.0.0.0 -m ${ram} -d /data
|
||||||
until docker run --rm --link ${uuid} --entrypoint /usr/bin/curl felixlohmeier/openrefine-client --silent -N http://${uuid}:3333 | cat | grep -q -o "OpenRefine" ; do sleep 1; done
|
until docker run --rm --link ${uuid} --entrypoint /usr/bin/curl felixlohmeier/openrefine-client --silent -N http://${uuid}:3333 | cat | grep -q -o "OpenRefine" ; do sleep 1; done
|
||||||
docker attach ${uuid} &
|
docker attach ${uuid} &
|
||||||
fi
|
fi
|
||||||
|
@ -337,3 +342,10 @@ done
|
||||||
echo ""
|
echo ""
|
||||||
diffsec="$((${checkpointdate[$checkpoints]} - ${checkpointdate[1]}))"
|
diffsec="$((${checkpointdate[$checkpoints]} - ${checkpointdate[1]}))"
|
||||||
echo "total run time: $(date -d@${diffsec} -u +%H:%M:%S) (hh:mm:ss)"
|
echo "total run time: $(date -d@${diffsec} -u +%H:%M:%S) (hh:mm:ss)"
|
||||||
|
|
||||||
|
# calculate and print memory load
|
||||||
|
max=${memoryload[0]}
|
||||||
|
for n in "${memoryload[@]}" ; do
|
||||||
|
((n > max)) && max=$n
|
||||||
|
done
|
||||||
|
echo "highest memory load: $(($max / 1024)) MB"
|
||||||
|
|
|
@ -1,9 +1,9 @@
|
||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
# openrefine-batch.sh, Felix Lohmeier, v1.0.1, 15.03.2017
|
# openrefine-batch.sh, Felix Lohmeier, v1.1, 2017-06-20
|
||||||
# https://github.com/felixlohmeier/openrefine-batch
|
# https://github.com/felixlohmeier/openrefine-batch
|
||||||
|
|
||||||
# declare download URLs for OpenRefine and OpenRefine client
|
# declare download URLs for OpenRefine and OpenRefine client
|
||||||
openrefine_URL="https://github.com/OpenRefine/OpenRefine/releases/download/2.7-rc.2/openrefine-linux-2.7-rc.2.tar.gz"
|
openrefine_URL="https://github.com/OpenRefine/OpenRefine/releases/download/2.7/openrefine-linux-2.7.tar.gz"
|
||||||
client_URL="https://github.com/felixlohmeier/openrefine-client/archive/v0.3.1.tar.gz"
|
client_URL="https://github.com/felixlohmeier/openrefine-client/archive/v0.3.1.tar.gz"
|
||||||
|
|
||||||
# check system requirements
|
# check system requirements
|
||||||
|
@ -98,7 +98,8 @@ Usage: ./openrefine-batch.sh [-a INPUTDIR] [-b TRANSFORMDIR] [-c OUTPUTDIR] ...
|
||||||
-c examples/powerhouse-museum/output/ \
|
-c examples/powerhouse-museum/output/ \
|
||||||
-f tsv \
|
-f tsv \
|
||||||
-i processQuotes=false \
|
-i processQuotes=false \
|
||||||
-i guessCellValueTypes=true
|
-i guessCellValueTypes=true \
|
||||||
|
-RX
|
||||||
|
|
||||||
clone or download GitHub repository to get example data:
|
clone or download GitHub repository to get example data:
|
||||||
https://github.com/felixlohmeier/openrefine-batch/archive/master.zip
|
https://github.com/felixlohmeier/openrefine-batch/archive/master.zip
|
||||||
|
@ -376,6 +377,7 @@ done
|
||||||
echo ""
|
echo ""
|
||||||
diffsec="$((${checkpointdate[$checkpoints]} - ${checkpointdate[1]}))"
|
diffsec="$((${checkpointdate[$checkpoints]} - ${checkpointdate[1]}))"
|
||||||
echo "total run time: $(date -d@${diffsec} -u +%H:%M:%S) (hh:mm:ss)"
|
echo "total run time: $(date -d@${diffsec} -u +%H:%M:%S) (hh:mm:ss)"
|
||||||
|
|
||||||
# calculate and print memory load
|
# calculate and print memory load
|
||||||
max=${memoryload[0]}
|
max=${memoryload[0]}
|
||||||
for n in "${memoryload[@]}" ; do
|
for n in "${memoryload[@]}" ; do
|
||||||
|
|
Loading…
Reference in New Issue