release v1.1, supports OpenRefine 2.7
This commit is contained in:
parent
246fbb72b2
commit
4c6cd7dbd4
94
README.md
94
README.md
|
@ -64,7 +64,7 @@ clone or [download GitHub repository](https://github.com/felixlohmeier/openrefin
|
|||
### Help Screen
|
||||
|
||||
```
|
||||
[18:20 felix ~/openrefine-batch]$ ./openrefine-batch.sh
|
||||
[14:45 felix ~/openrefine-batch]$ ./openrefine-batch.sh
|
||||
Usage: ./openrefine-batch.sh [-a INPUTDIR] [-b TRANSFORMDIR] [-c OUTPUTDIR] ...
|
||||
|
||||
== basic arguments ==
|
||||
|
@ -106,11 +106,10 @@ Usage: ./openrefine-batch.sh [-a INPUTDIR] [-b TRANSFORMDIR] [-c OUTPUTDIR] ...
|
|||
|
||||
== example ==
|
||||
|
||||
./openrefine-batch.sh -a examples/powerhouse-museum/input/ -b examples/powerhouse-museum/config/ -c examples/powerhouse-museum/output/ -f tsv -i processQuotes=false -i guessCellValueTypes=true
|
||||
./openrefine-batch.sh -a examples/powerhouse-museum/input/ -b examples/powerhouse-museum/config/ -c examples/powerhouse-museum/output/ -f tsv -i processQuotes=false -i guessCellValueTypes=true -RX
|
||||
|
||||
clone or download GitHub repository to get example data:
|
||||
https://github.com/felixlohmeier/openrefine-batch/archive/master.zip
|
||||
|
||||
```
|
||||
|
||||
### Logging
|
||||
|
@ -118,14 +117,17 @@ https://github.com/felixlohmeier/openrefine-batch/archive/master.zip
|
|||
The script prints log messages from OpenRefine server and makes use of `ps` to show statistics for each step. Here is a sample:
|
||||
|
||||
```
|
||||
[17:55 felix ~/openrefine-batch]$ ./openrefine-batch.sh \
|
||||
> -a examples/powerhouse-museum/input/ \
|
||||
> -b examples/powerhouse-museum/config/ \
|
||||
> -c examples/powerhouse-museum/output/ \
|
||||
> -f tsv \
|
||||
> -i processQuotes=false \
|
||||
> -i guessCellValueTypes=true \
|
||||
> -RX
|
||||
[14:46 felix ~/openrefine-batch]$ ./openrefine-batch.sh -a examples/powerhouse-museum/input/ -b examples/powerhouse-museum/config/ -c examples/powerhouse-museum/output/ -f tsv -i processQuotes=false -i guessCellValueTypes=true -RX
|
||||
Download OpenRefine...
|
||||
openrefine-linux-2.7.tar.g 100%[=====================================>] 60,23M 8,89MB/s in 11s
|
||||
Install OpenRefine in subdirectory openrefine...
|
||||
Total bytes read: 72990720 (70MiB, 136MiB/s)
|
||||
|
||||
Download OpenRefine client...
|
||||
v0.3.1.tar.gz [ <=> ] 563,12K 1015KB/s in 0,6s
|
||||
Install OpenRefine client in subdirectory openrefine-client...
|
||||
Total bytes read: 3082240 (3,0MiB, 90MiB/s)
|
||||
|
||||
Input directory: /home/felix/openrefine-batch/examples/powerhouse-museum/input
|
||||
Input files: phm-collection.tsv
|
||||
Input format: --format=tsv
|
||||
|
@ -143,80 +145,80 @@ restart after transform: false
|
|||
|
||||
=== 1. Launch OpenRefine ===
|
||||
|
||||
starting time: Di 14. Mär 17:58:08 CET 2017
|
||||
starting time: Di 20. Jun 13:51:06 CEST 2017
|
||||
|
||||
Starting OpenRefine at 'http://127.0.0.1:3333/'
|
||||
|
||||
17:58:08.758 [ refine_server] Starting Server bound to '127.0.0.1:3333' (0ms)
|
||||
17:58:08.760 [ refine_server] refine.memory size: 2048M JVM Max heap: 1908932608 (2ms)
|
||||
17:58:08.787 [ refine_server] Initializing context: '/' from '/home/felix/openrefine-batch/openrefine/webapp' (27ms)
|
||||
17:58:09.463 [ refine] Starting OpenRefine 2.7-rc.1 [TRUNK]... (676ms)
|
||||
17:58:09.476 [ FileProjectManager] Failed to load workspace from any attempted alternatives. (13ms)
|
||||
17:58:12.003 [ refine] Running in headless mode (2527ms)
|
||||
13:51:06.727 [ refine_server] Starting Server bound to '127.0.0.1:3333' (0ms)
|
||||
13:51:06.728 [ refine_server] refine.memory size: 2048M JVM Max heap: 1908932608 (1ms)
|
||||
13:51:06.737 [ refine_server] Initializing context: '/' from '/home/felix/openrefine-batch/openrefine/webapp' (9ms)
|
||||
13:51:06.973 [ refine] Starting OpenRefine 2.7 [TRUNK]... (236ms)
|
||||
13:51:06.978 [ FileProjectManager] Failed to load workspace from any attempted alternatives. (5ms)
|
||||
13:51:09.377 [ refine] Running in headless mode (2399ms)
|
||||
|
||||
=== 2. Import all files ===
|
||||
|
||||
starting time: Di 14. Mär 17:58:12 CET 2017
|
||||
starting time: Di 20. Jun 13:51:09 CEST 2017
|
||||
|
||||
import phm-collection.tsv...
|
||||
17:58:13.068 [ refine] POST /command/core/create-project-from-upload (1065ms)
|
||||
New project: 2073385535316
|
||||
17:58:26.543 [ refine] GET /command/core/get-rows (13475ms)
|
||||
13:51:09.900 [ refine] POST /command/core/create-project-from-upload (523ms)
|
||||
New project: 2034248478869
|
||||
13:51:14.110 [ refine] GET /command/core/get-rows (4210ms)
|
||||
Number of rows: 75814
|
||||
STARTED ELAPSED %MEM %CPU RSS
|
||||
17:58:07 00:18 9.8 168 795024
|
||||
13:51:05 00:08 5.3 191 864692
|
||||
|
||||
=== 3. Prepare transform & export ===
|
||||
|
||||
starting time: Di 14. Mär 17:58:26 CET 2017
|
||||
starting time: Di 20. Jun 13:51:14 CEST 2017
|
||||
|
||||
get project ids...
|
||||
17:58:26.778 [ refine] GET /command/core/get-all-project-metadata (235ms)
|
||||
2073385535316: phm-collection.tsv
|
||||
13:51:14.207 [ refine] GET /command/core/get-all-project-metadata (97ms)
|
||||
2034248478869: phm-collection.tsv
|
||||
|
||||
=== 4. Transform phm-collection.tsv ===
|
||||
|
||||
starting time: Di 14. Mär 17:58:26 CET 2017
|
||||
starting time: Di 20. Jun 13:51:14 CEST 2017
|
||||
|
||||
transform phm-transform.json...
|
||||
17:58:26.917 [ refine] GET /command/core/get-models (139ms)
|
||||
17:58:26.934 [ refine] POST /command/core/apply-operations (17ms)
|
||||
13:51:14.265 [ refine] GET /command/core/get-models (58ms)
|
||||
13:51:14.273 [ refine] POST /command/core/apply-operations (8ms)
|
||||
STARTED ELAPSED %MEM %CPU RSS
|
||||
17:58:07 01:02 13.5 134 1096916
|
||||
13:51:05 00:23 7.0 155 1142712
|
||||
|
||||
|
||||
=== 5. Export phm-collection.tsv ===
|
||||
|
||||
starting time: Di 14. Mär 17:59:09 CET 2017
|
||||
starting time: Di 20. Jun 13:51:29 CEST 2017
|
||||
|
||||
export to file phm-collection.tsv...
|
||||
17:59:09.944 [ refine] GET /command/core/get-models (43010ms)
|
||||
17:59:09.956 [ refine] GET /command/core/get-all-project-metadata (12ms)
|
||||
17:59:09.967 [ refine] POST /command/core/export-rows/phm-collection.tsv.tsv (11ms)
|
||||
13:51:29.824 [ refine] GET /command/core/get-models (15551ms)
|
||||
13:51:29.827 [ refine] GET /command/core/get-all-project-metadata (3ms)
|
||||
13:51:29.841 [ refine] POST /command/core/export-rows/phm-collection.tsv.tsv (14ms)
|
||||
STARTED ELAPSED %MEM %CPU RSS
|
||||
17:58:07 02:24 13.5 60.5 1098056
|
||||
13:51:05 00:49 7.0 75.7 1144808
|
||||
|
||||
|
||||
output (number of lines / size in bytes):
|
||||
167017 60527726 /home/felix/openrefine-batch/examples/powerhouse-museum/output/phm-collection.tsv
|
||||
|
||||
cleanup...
|
||||
18:00:35.425 [ ProjectManager] Saving all modified projects ... (85458ms)
|
||||
18:00:42.357 [ project_utilities] Saved project '2073385535316' (6932ms)
|
||||
13:51:55.783 [ ProjectManager] Saving all modified projects ... (25942ms)
|
||||
13:51:58.324 [ project_utilities] Saved project '2034248478869' (2541ms)
|
||||
|
||||
=== Statistics ===
|
||||
|
||||
starting time and run time of each step:
|
||||
Start process Di 14. Mär 17:58:08 CET 2017 (00:00:00)
|
||||
Launch OpenRefine Di 14. Mär 17:58:08 CET 2017 (00:00:04)
|
||||
Import all files Di 14. Mär 17:58:12 CET 2017 (00:00:14)
|
||||
Prepare transform & export Di 14. Mär 17:58:26 CET 2017 (00:00:00)
|
||||
Transform phm-collection.tsv Di 14. Mär 17:58:26 CET 2017 (00:00:43)
|
||||
Export phm-collection.tsv Di 14. Mär 17:59:09 CET 2017 (00:01:34)
|
||||
End process Di 14. Mär 18:00:43 CET 2017 (00:00:00)
|
||||
Start process Di 20. Jun 13:51:06 CEST 2017 (00:00:00)
|
||||
Launch OpenRefine Di 20. Jun 13:51:06 CEST 2017 (00:00:03)
|
||||
Import all files Di 20. Jun 13:51:09 CEST 2017 (00:00:05)
|
||||
Prepare transform & export Di 20. Jun 13:51:14 CEST 2017 (00:00:00)
|
||||
Transform phm-collection.tsv Di 20. Jun 13:51:14 CEST 2017 (00:00:15)
|
||||
Export phm-collection.tsv Di 20. Jun 13:51:29 CEST 2017 (00:00:30)
|
||||
End process Di 20. Jun 13:51:59 CEST 2017 (00:00:00)
|
||||
|
||||
total run time: 00:02:35 (hh:mm:ss)
|
||||
highest memory load: 1072 MB
|
||||
total run time: 00:00:53 (hh:mm:ss)
|
||||
highest memory load: 1117 MB
|
||||
```
|
||||
|
||||
### Docker
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
#!/bin/bash
|
||||
# openrefine-batch.sh, Felix Lohmeier, v1.0, 14.03.2017
|
||||
# openrefine-batch.sh, Felix Lohmeier, v1.1, 2017-06-20
|
||||
# https://github.com/felixlohmeier/openrefine-batch
|
||||
|
||||
# check system requirements
|
||||
|
@ -17,7 +17,7 @@ fi
|
|||
# help screen
|
||||
function usage () {
|
||||
cat <<EOF
|
||||
Usage: ./openrefine-batch-docker.sh [-a INPUTDIR] [-b TRANSFORMDIR] [-c OUTPUTDIR] ...
|
||||
Usage: sudo ./openrefine-batch-docker.sh [-a INPUTDIR] [-b TRANSFORMDIR] [-c OUTPUTDIR] ...
|
||||
|
||||
== basic arguments ==
|
||||
-a INPUTDIR path to directory with source files (leave empty to transform only ; multiple files may be imported into a single project by providing a zip or tar.gz archive, cf. https://github.com/OpenRefine/OpenRefine/wiki/Importers )
|
||||
|
@ -29,7 +29,7 @@ Usage: ./openrefine-batch-docker.sh [-a INPUTDIR] [-b TRANSFORMDIR] [-c OUTPUTDI
|
|||
-f INPUTFORMAT (csv, tsv, xml, json, line-based, fixed-width, xlsx, ods)
|
||||
-i INPUTOPTIONS several options provided by openrefine-client, see below...
|
||||
-m RAM maximum RAM for OpenRefine java heap space (default: 2048M)
|
||||
-v VERSION OpenRefine version (2.7rc2, 2.7rc1, 2.6rc2, 2.6rc1, dev; default: 2.7rc2)
|
||||
-v VERSION OpenRefine version (2.7, 2.7rc2, 2.7rc1, 2.6rc2, 2.6rc1, dev; default: 2.7)
|
||||
-E do NOT export files
|
||||
-R do NOT restart OpenRefine after each transformation (e.g. config file)
|
||||
-X do NOT restart OpenRefine after each project (e.g. input file)
|
||||
|
@ -58,13 +58,14 @@ Usage: ./openrefine-batch-docker.sh [-a INPUTDIR] [-b TRANSFORMDIR] [-c OUTPUTDI
|
|||
|
||||
== example ==
|
||||
|
||||
./openrefine-batch-docker.sh \
|
||||
sudo ./openrefine-batch-docker.sh \
|
||||
-a examples/powerhouse-museum/input/ \
|
||||
-b examples/powerhouse-museum/config/ \
|
||||
-c examples/powerhouse-museum/output/ \
|
||||
-f tsv \
|
||||
-i processQuotes=false \
|
||||
-i guessCellValueTypes=true
|
||||
-i guessCellValueTypes=true \
|
||||
-RX
|
||||
|
||||
clone or download GitHub repository to get example data:
|
||||
https://github.com/felixlohmeier/openrefine-batch/archive/master.zip
|
||||
|
@ -75,7 +76,7 @@ EOF
|
|||
|
||||
# defaults
|
||||
ram="2048M"
|
||||
version="2.7rc2"
|
||||
version="2.7"
|
||||
restartfile="true"
|
||||
restarttransform="true"
|
||||
export="true"
|
||||
|
@ -160,6 +161,7 @@ echo ""
|
|||
checkpoints=${#checkpointdate[@]}
|
||||
checkpointdate[$(($checkpoints + 1))]=$(date +%s)
|
||||
checkpointname[$(($checkpoints + 1))]="Start process"
|
||||
memoryload=()
|
||||
|
||||
# launch server
|
||||
checkpoints=${#checkpointdate[@]}
|
||||
|
@ -169,7 +171,7 @@ echo "=== $checkpoints. ${checkpointname[$(($checkpoints + 1))]} ==="
|
|||
echo ""
|
||||
echo "starting time: $(date --date=@${checkpointdate[$(($checkpoints + 1))]})"
|
||||
echo ""
|
||||
docker run -d --name=${uuid} -v ${outputdir}:/data felixlohmeier/openrefine:${version} -i 0.0.0.0 -m ${ram} -d /data
|
||||
docker run -d --name=${uuid} -v ${outputdir}:/data:z felixlohmeier/openrefine:${version} -i 0.0.0.0 -m ${ram} -d /data
|
||||
# wait until server is available
|
||||
until docker run --rm --link ${uuid} --entrypoint /usr/bin/curl felixlohmeier/openrefine-client --silent -N http://${uuid}:3333 | cat | grep -q -o "OpenRefine" ; do sleep 1; done
|
||||
# show server logs
|
||||
|
@ -188,16 +190,17 @@ if [ -n "$inputfiles" ]; then
|
|||
for inputfile in "${inputfiles[@]}" ; do
|
||||
echo "import ${inputfile}..."
|
||||
# run client with input command
|
||||
docker run --rm --link ${uuid} -v ${inputdir}:/data felixlohmeier/openrefine-client -H ${uuid} -c $inputfile $inputformat ${inputoptions[@]}
|
||||
docker run --rm --link ${uuid} -v ${inputdir}:/data:z felixlohmeier/openrefine-client -H ${uuid} -c $inputfile $inputformat ${inputoptions[@]}
|
||||
# show allocated system resources
|
||||
ps -o start,etime,%mem,%cpu,rss -C java --sort=start
|
||||
memoryload+=($(ps --no-headers -o rss -C java))
|
||||
echo ""
|
||||
# restart server to clear memory
|
||||
if [ "$restartfile" = "true" ]; then
|
||||
echo "save project and restart OpenRefine server..."
|
||||
docker stop -t=5000 ${uuid}
|
||||
docker rm ${uuid}
|
||||
docker run -d --name=${uuid} -v ${outputdir}:/data felixlohmeier/openrefine:${version} -i 0.0.0.0 -m ${ram} -d /data
|
||||
docker run -d --name=${uuid} -v ${outputdir}:/data:z felixlohmeier/openrefine:${version} -i 0.0.0.0 -m ${ram} -d /data
|
||||
until docker run --rm --link ${uuid} --entrypoint /usr/bin/curl felixlohmeier/openrefine-client --silent -N http://${uuid}:3333 | cat | grep -q -o "OpenRefine" ; do sleep 1; done
|
||||
docker attach ${uuid} &
|
||||
echo ""
|
||||
|
@ -232,7 +235,7 @@ if [ -n "$jsonfiles" ] || [ "$export" = "true" ]; then
|
|||
echo "restart OpenRefine server to advertise copied projects..."
|
||||
docker stop -t=5000 ${uuid}
|
||||
docker rm ${uuid}
|
||||
docker run -d --name=${uuid} -v ${outputdir}:/data felixlohmeier/openrefine:${version} -i 0.0.0.0 -m ${ram} -d /data
|
||||
docker run -d --name=${uuid} -v ${outputdir}:/data:z felixlohmeier/openrefine:${version} -i 0.0.0.0 -m ${ram} -d /data
|
||||
until docker run --rm --link ${uuid} --entrypoint /usr/bin/curl felixlohmeier/openrefine-client --silent -N http://${uuid}:3333 | cat | grep -q -o "OpenRefine" ; do sleep 1; done
|
||||
docker attach ${uuid} &
|
||||
echo ""
|
||||
|
@ -253,16 +256,17 @@ if [ -n "$jsonfiles" ] || [ "$export" = "true" ]; then
|
|||
for jsonfile in "${jsonfiles[@]}" ; do
|
||||
echo "transform ${jsonfile}..."
|
||||
# run client with apply command
|
||||
docker run --rm --link ${uuid} -v ${configdir}:/data felixlohmeier/openrefine-client -H ${uuid} -f ${jsonfile} ${projectids[i]}
|
||||
docker run --rm --link ${uuid} -v ${configdir}:/data:z felixlohmeier/openrefine-client -H ${uuid} -f ${jsonfile} ${projectids[i]}
|
||||
# allocated system resources
|
||||
ps -o start,etime,%mem,%cpu,rss -C java --sort=start
|
||||
memoryload+=($(ps --no-headers -o rss -C java))
|
||||
echo ""
|
||||
# restart server to clear memory
|
||||
if [ "$restarttransform" = "true" ]; then
|
||||
echo "save project and restart OpenRefine server..."
|
||||
docker stop -t=5000 ${uuid}
|
||||
docker rm ${uuid}
|
||||
docker run -d --name=${uuid} -v ${outputdir}:/data felixlohmeier/openrefine:${version} -i 0.0.0.0 -m ${ram} -d /data
|
||||
docker run -d --name=${uuid} -v ${outputdir}:/data:z felixlohmeier/openrefine:${version} -i 0.0.0.0 -m ${ram} -d /data
|
||||
until docker run --rm --link ${uuid} --entrypoint /usr/bin/curl felixlohmeier/openrefine-client --silent -N http://${uuid}:3333 | cat | grep -q -o "OpenRefine" ; do sleep 1; done
|
||||
docker attach ${uuid} &
|
||||
fi
|
||||
|
@ -283,9 +287,10 @@ if [ -n "$jsonfiles" ] || [ "$export" = "true" ]; then
|
|||
filename=${projectnames[i]%.*}
|
||||
echo "export to file ${filename}.tsv..."
|
||||
# run client with export command
|
||||
docker run --rm --link ${uuid} -v ${outputdir}:/data felixlohmeier/openrefine-client -H ${uuid} -E --output="${filename}.tsv" ${projectids[i]}
|
||||
docker run --rm --link ${uuid} -v ${outputdir}:/data:z felixlohmeier/openrefine-client -H ${uuid} -E --output="${filename}.tsv" ${projectids[i]}
|
||||
# show allocated system resources
|
||||
ps -o start,etime,%mem,%cpu,rss -C java --sort=start
|
||||
memoryload+=($(ps --no-headers -o rss -C java))
|
||||
echo ""
|
||||
fi
|
||||
|
||||
|
@ -294,7 +299,7 @@ if [ -n "$jsonfiles" ] || [ "$export" = "true" ]; then
|
|||
echo "restart OpenRefine server..."
|
||||
docker stop -t=5000 ${uuid}
|
||||
docker rm ${uuid}
|
||||
docker run -d --name=${uuid} -v ${outputdir}:/data felixlohmeier/openrefine:${version} -i 0.0.0.0 -m ${ram} -d /data
|
||||
docker run -d --name=${uuid} -v ${outputdir}:/data:z felixlohmeier/openrefine:${version} -i 0.0.0.0 -m ${ram} -d /data
|
||||
until docker run --rm --link ${uuid} --entrypoint /usr/bin/curl felixlohmeier/openrefine-client --silent -N http://${uuid}:3333 | cat | grep -q -o "OpenRefine" ; do sleep 1; done
|
||||
docker attach ${uuid} &
|
||||
fi
|
||||
|
@ -337,3 +342,10 @@ done
|
|||
echo ""
|
||||
diffsec="$((${checkpointdate[$checkpoints]} - ${checkpointdate[1]}))"
|
||||
echo "total run time: $(date -d@${diffsec} -u +%H:%M:%S) (hh:mm:ss)"
|
||||
|
||||
# calculate and print memory load
|
||||
max=${memoryload[0]}
|
||||
for n in "${memoryload[@]}" ; do
|
||||
((n > max)) && max=$n
|
||||
done
|
||||
echo "highest memory load: $(($max / 1024)) MB"
|
||||
|
|
|
@ -1,9 +1,9 @@
|
|||
#!/bin/bash
|
||||
# openrefine-batch.sh, Felix Lohmeier, v1.0.1, 15.03.2017
|
||||
# openrefine-batch.sh, Felix Lohmeier, v1.1, 2017-06-20
|
||||
# https://github.com/felixlohmeier/openrefine-batch
|
||||
|
||||
# declare download URLs for OpenRefine and OpenRefine client
|
||||
openrefine_URL="https://github.com/OpenRefine/OpenRefine/releases/download/2.7-rc.2/openrefine-linux-2.7-rc.2.tar.gz"
|
||||
openrefine_URL="https://github.com/OpenRefine/OpenRefine/releases/download/2.7/openrefine-linux-2.7.tar.gz"
|
||||
client_URL="https://github.com/felixlohmeier/openrefine-client/archive/v0.3.1.tar.gz"
|
||||
|
||||
# check system requirements
|
||||
|
@ -98,7 +98,8 @@ Usage: ./openrefine-batch.sh [-a INPUTDIR] [-b TRANSFORMDIR] [-c OUTPUTDIR] ...
|
|||
-c examples/powerhouse-museum/output/ \
|
||||
-f tsv \
|
||||
-i processQuotes=false \
|
||||
-i guessCellValueTypes=true
|
||||
-i guessCellValueTypes=true \
|
||||
-RX
|
||||
|
||||
clone or download GitHub repository to get example data:
|
||||
https://github.com/felixlohmeier/openrefine-batch/archive/master.zip
|
||||
|
@ -376,6 +377,7 @@ done
|
|||
echo ""
|
||||
diffsec="$((${checkpointdate[$checkpoints]} - ${checkpointdate[1]}))"
|
||||
echo "total run time: $(date -d@${diffsec} -u +%H:%M:%S) (hh:mm:ss)"
|
||||
|
||||
# calculate and print memory load
|
||||
max=${memoryload[0]}
|
||||
for n in "${memoryload[@]}" ; do
|
||||
|
|
Loading…
Reference in New Issue