This commit is contained in:
Felix Lohmeier 2017-02-27 03:48:43 +01:00
parent 67047b45d6
commit 2e19b9ef78
2 changed files with 135 additions and 112 deletions

101
README.md
View File

@ -42,21 +42,22 @@ Windows:
Clone or [download the GitHub repository](https://github.com/felixlohmeier/openrefine-batch/archive/master.zip) to get the example data
```
./openrefine-batch.sh examples/powerhouse-museum/input/ examples/powerhouse-museum/config/ examples/powerhouse-museum/output/ 4G tsv --processQuotes=false --guessCellValueTypes=true
./openrefine-batch.sh examples/powerhouse-museum/input/ examples/powerhouse-museum/config/ examples/powerhouse-museum/output/ examples/powerhouse-museum/cross/ 4G tsv --processQuotes=false --guessCellValueTypes=true
```
#### Options
```
./openrefine-batch.sh $inputdir $configdir $outputdir $ram $inputformat $inputoptions
./openrefine-batch.sh $inputdir $configdir $outputdir $crossdir $ram $inputformat $inputoptions
```
1. inputdir: path to directory with source files (multiple files may be imported into a single project by providing a zip or tar.gz archive)
2. configdir: path to directory with OpenRefine transformation rules (json files)
3. outputdir: path to directory for exported files (and temporary workspace)
4. ram: maximum RAM for OpenRefine java heap space (default: 4G)
5. inputformat: csv, tsv, xml, json, line-based, fixed-width, xlsx or ods
6. inputoptions: several options provided by [openrefine-client](https://hub.docker.com/r/felixlohmeier/openrefine-client/)
1. inputdir: path to directory with source files (multiple files may be imported into a single project [by providing a zip or tar.gz archive](https://github.com/OpenRefine/OpenRefine/wiki/Importers))
2. configdir: path to directory with [OpenRefine transformation rules (json files)](http://kb.refinepro.com/2012/06/google-refine-json-and-my-notepad-or.html)
3. outputdir: path to directory for exported files (and OpenRefine workspace)
4. crossdir: path to directory with additional OpenRefine projects (will be copied to the workspace before the transformation step to support the [cross function](https://github.com/OpenRefine/OpenRefine/wiki/GREL-Other-Functions#crosscell-c-string-projectname-string-columnname))
5. ram: maximum RAM for OpenRefine java heap space (default: 4G)
6. inputformat: csv, tsv, xml, json, line-based, fixed-width, xlsx or ods
7. inputoptions: several options provided by [openrefine-client](https://hub.docker.com/r/felixlohmeier/openrefine-client/)
inputoptions (mandatory for xml, json, fixed-width, xlsx, ods):
* `--recordPath=RECORDPATH` (xml, json): please provide path in multiple arguments without slashes, e.g. /collection/record/ should be entered like this: `--recordPath=collection --recordPath=record`
@ -83,69 +84,72 @@ more inputoptions (optional, only together with inputformat):
The script uses `docker attach` to print log messages from the OpenRefine server and `ps` to show statistics for each step. Here is a sample log:
```
[00:08 felix ~/openrefine/openrefine-batch]$ ./openrefine-batch.sh examples/powerhouse-museum/input/ examples/powerhouse-museum/config/ examples/powerhouse-museum/output/ 4G tsv --processQuotes=false --guessCellValueTypes=true
Input dir: /home/felix/occcloud/Openness/Kunden+Projekte/OpenRefine/openrefine-batch/examples/powerhouse-museum/input
[03:27 felix ~/openrefine-batch (master *)]$ ./openrefine-batch.sh examples/powerhouse-museum/input/ examples/powerhouse-museum/config/ examples/powerhouse-museum/output/ examples/powerhouse-museum/cross/ 4G tsv --processQuotes=false --guessCellValueTypes=true
Input directory: /home/felix/openrefine-batch/examples/powerhouse-museum/input
Input files: phm-collection.tsv
Input format: --format=tsv
Input options: --processQuotes=false --guessCellValueTypes=true
Config directory: /home/felix/openrefine-batch/examples/powerhouse-museum/config
Transformation rules: phm-transform.json
Cross directory: /home/felix/openrefine-batch/examples/powerhouse-museum/cross
Cross projects:
OpenRefine heap space: 4G
OpenRefine version: 2.7rc1
Docker container: 41ca6232-8484-40e0-a606-3bcbf29903f6
Output directory: /home/felix/occcloud/Openness/Kunden+Projekte/OpenRefine/openrefine-batch/examples/powerhouse-museum/output
Docker container: 6b7eb36f-fc72-4040-b135-acee36948c13
Output directory: /home/felix/openrefine-batch/examples/powerhouse-museum/output
begin: Mo 27. Feb 00:08:02 CET 2017
begin: Mo 27. Feb 03:28:45 CET 2017
start OpenRefine server...
[sudo] password for felix:
fab9894d902372767cdb38d05b6e247dce722da22192d734862fc2f096a23d51
92499ecd252a8768ea5b57e0be0fb30fe6340eab67d28b1be158e0ad01f79419
import phm-collection.tsv...
New project: 1719405033732
New project: 2325849087106
Number of rows: 75814
STARTED ELAPSED %MEM %CPU RSS
00:08:13 00:29 10.0 122 813604
03:28:55 00:29 10.0 122 812208
save project and restart OpenRefine server...
23:08:46.130 [ ProjectManager] Saving all modified projects ... (4679ms)
23:08:55.190 [ project_utilities] Saved project '1719405033732' (9060ms)
41ca6232-8484-40e0-a606-3bcbf29903f6
41ca6232-8484-40e0-a606-3bcbf29903f6
6bb7ee1f1f2a1d09e191a3fadad9e26aaa89414b2c618a47d3d3ef7c040c6b1a
02:29:28.170 [ ProjectManager] Saving all modified projects ... (4594ms)
02:29:36.414 [ project_utilities] Saved project '2325849087106' (8244ms)
6b7eb36f-fc72-4040-b135-acee36948c13
6b7eb36f-fc72-4040-b135-acee36948c13
f28de26b99475c4db09dbfb9ab3d445aa8127dedd08b8e729cb6b4d65c96bf38
begin project 1719405033732 @ Mo 27. Feb 00:09:12 CET 2017
begin project 2325849087106 @ Mo 27. Feb 03:29:52 CET 2017
transform phm-transform.json...
23:09:13.747 [ refine] GET /command/core/get-models (2489ms)
23:09:16.887 [ project] Loaded project 1719405033732 from disk in 3 sec(s) (3140ms)
23:09:17.140 [ refine] POST /command/core/apply-operations (253ms)
02:29:54.372 [ refine] GET /command/core/get-models (2815ms)
02:29:57.525 [ project] Loaded project 2325849087106 from disk in 3 sec(s) (3153ms)
02:29:57.640 [ refine] POST /command/core/apply-operations (115ms)
STARTED ELAPSED %MEM %CPU RSS
00:08:57 01:10 20.1 124 1625788
03:29:38 01:07 19.6 128 1588152
save project and restart OpenRefine server...
23:10:07.930 [ ProjectManager] Saving all modified projects ... (50790ms)
23:10:15.173 [ project_utilities] Saved project '1719405033732' (7243ms)
41ca6232-8484-40e0-a606-3bcbf29903f6
41ca6232-8484-40e0-a606-3bcbf29903f6
cc9c49dcaf54c720d915a55b4e646909f657fb6582c0ac3c9f069996b9cd0b53
export to file 1719405033732.tsv...
23:10:29.972 [ refine] GET /command/core/get-models (4381ms)
23:10:33.826 [ project] Loaded project 1719405033732 from disk in 3 sec(s) (3854ms)
23:10:34.123 [ refine] GET /command/core/get-all-project-metadata (297ms)
23:10:34.140 [ refine] POST /command/core/export-rows/phm-collection.tsv.tsv (17ms)
02:30:46.280 [ ProjectManager] Saving all modified projects ... (48640ms)
02:30:53.404 [ project_utilities] Saved project '2325849087106' (7124ms)
6b7eb36f-fc72-4040-b135-acee36948c13
6b7eb36f-fc72-4040-b135-acee36948c13
186b0bda0ca542642ce1875d55f8341648e05248eb359541b80191832783f40b
export to file 2325849087106.tsv...
02:31:08.149 [ refine] GET /command/core/get-models (4039ms)
02:31:11.485 [ project] Loaded project 2325849087106 from disk in 3 sec(s) (3336ms)
02:31:11.756 [ refine] GET /command/core/get-all-project-metadata (271ms)
02:31:11.774 [ refine] POST /command/core/export-rows/phm-collection.tsv.tsv (18ms)
STARTED ELAPSED %MEM %CPU RSS
00:10:17 02:01 12.8 27.2 1041596
save project and restart OpenRefine server...
41ca6232-8484-40e0-a606-3bcbf29903f6
41ca6232-8484-40e0-a606-3bcbf29903f6
8e1febaf862c2e0bb162c6dfe968015b54f600d6b45f8d1a401b74e7285bc521
finished project 1719405033732 @ Mo 27. Feb 00:12:36 CET 2017
cleanup...
41ca6232-8484-40e0-a606-3bcbf29903f6
41ca6232-8484-40e0-a606-3bcbf29903f6
03:30:55 01:59 11.6 28.6 942900
restart OpenRefine server...
6b7eb36f-fc72-4040-b135-acee36948c13
6b7eb36f-fc72-4040-b135-acee36948c13
eb0f91675b5fbf21b4c17cceb6d93146876ea19316b7ab44af78a36f64ff1037
finished project 2325849087106 @ Mo 27. Feb 03:33:11 CET 2017
output (number of lines / size in bytes):
167017 60527726 /home/felix/occcloud/Openness/Kunden+Projekte/OpenRefine/openrefine-batch/examples/powerhouse-museum/output/1719405033732.tsv
167017 60527726 /home/felix/openrefine-batch/examples/powerhouse-museum/output/2325849087106.tsv
finish: Mo 27. Feb 00:12:42 CET 2017
cleanup...
6b7eb36f-fc72-4040-b135-acee36948c13
6b7eb36f-fc72-4040-b135-acee36948c13
finish: Mo 27. Feb 03:33:17 CET 2017
```
### Todo
@ -153,6 +157,7 @@ finish: Mo 27. Feb 00:12:42 CET 2017
- [ ] howto for installation on Mac and Windows
- [ ] howto for extracting input options from OpenRefine GUI with Firefox network monitor
- [ ] use getopts for parsing of arguments
- [ ] add option to delete openrefine projects in output directory
- [ ] provide more example data from other OpenRefine tutorials
### Licensing

View File

@ -1,23 +1,23 @@
#!/bin/bash
# openrefine-batch.sh, Felix Lohmeier, v0.1, 27.02.2017
# openrefine-batch.sh, Felix Lohmeier, v0.2, 27.02.2017
# https://github.com/felixlohmeier/openrefine-batch
# user input
if [ -z "$1" ]
then
echo 1>&2 "please provide path to directory with source files"
echo 1>&2 "please provide path to directory with source files (leave empty to transform only)"
exit 2
else
inputdir=$(readlink -f $1)
inputfiles=($(basename -a ${inputdir}/*))
inputfiles=($(find ${inputdir}/* -type f -printf "%f\n"))
fi
if [ -z "$2" ]
then
echo 1>&2 "please provide path to directory with config files"
echo 1>&2 "please provide path to directory with config files (leave empty to import only)"
exit 2
else
configdir=$(readlink -f $2)
jsonfiles=($(basename -a ${configdir}/*))
jsonfiles=($(find ${configdir}/* -type f -printf "%f\n"))
fi
if [ -z "$3" ]
then
@ -29,31 +29,42 @@ if [ -z "$3" ]
fi
if [ -z "$4" ]
then
ram="4G"
echo 1>&2 "please provide path to directory with additional OpenRefine projects for use with cross function (may be empty)"
exit 2
else
ram="$4"
crossdir=$(readlink -f $4)
crossprojects=($(find ${crossdir}/* -maxdepth 0 -type d -printf "%f\n"))
fi
if [ -z "$5" ]
then
inputformat=""
ram="4G"
else
inputformat="--format=${5}"
ram="$5"
fi
if [ -z "$6" ]
then
inputformat=""
else
inputformat="--format=${6}"
fi
if [ -z "$7" ]
then
inputoptions=""
else
inputoptions=( "$6" "$7" "$8" "$9" "${10}" "${11}" "${12}" "${13}" "${14}" "${15}" )
inputoptions=( "$7" "$8" "$9" "${10}" "${11}" "${12}" "${13}" "${14}" "${15}" "${16}" "${17}" "${18}" "${19}" "${20}" )
fi
# variables
version="2.7rc1"
uuid=$(cat /proc/sys/kernel/random/uuid)
echo "Input dir: $inputdir"
echo "Input directory: $inputdir"
echo "Input files: ${inputfiles[@]}"
echo "Input format: $inputformat"
echo "Input options: ${inputoptions[@]}"
echo "Config directory: $configdir"
echo "Transformation rules: ${jsonfiles[@]}"
echo "Cross directory: $crossdir"
echo "Cross projects: ${crossprojects[@]}"
echo "OpenRefine heap space: $ram"
echo "OpenRefine version: $version"
echo "Docker container: $uuid"
@ -70,37 +81,14 @@ sudo docker run -d --name=${uuid} -v ${outputdir}:/data felixlohmeier/openrefine
until sudo docker run --rm --link ${uuid} --entrypoint /usr/bin/curl felixlohmeier/openrefine-client --silent -N http://${uuid}:3333 | cat | grep -q -o "OpenRefine" ; do sleep 1; done
echo ""
# import all files
for inputfile in "${inputfiles[@]}" ; do
echo "import ${inputfile}..."
# import
sudo docker run --rm --link ${uuid} -v ${inputdir}:/data felixlohmeier/openrefine-client -H ${uuid} -c $inputfile $inputformat ${inputoptions[@]}
# show server logs
sudo docker attach ${uuid} &
# statistics
ps -o start,etime,%mem,%cpu,rss -C java
# restart server to clear memory
echo "save project and restart OpenRefine server..."
sudo docker stop -t=5000 ${uuid}
sudo docker rm ${uuid}
sudo docker run -d --name=${uuid} -v ${outputdir}:/data felixlohmeier/openrefine:${version} -i 0.0.0.0 -m ${ram} -d /data
until sudo docker run --rm --link ${uuid} --entrypoint /usr/bin/curl felixlohmeier/openrefine-client --silent -N http://${uuid}:3333 | cat | grep -q -o "OpenRefine" ; do sleep 1; done
echo ""
done
# get project ids
projects=($(sudo docker run --rm --link ${uuid} felixlohmeier/openrefine-client -H ${uuid} -l | cut -c 2-14))
# loop for all projects
for projectid in "${projects[@]}" ; do
echo "begin project $projectid @ $(date)"
# apply transformation rules
for jsonfile in "${jsonfiles[@]}" ; do
echo "transform ${jsonfile}..."
if [ -n "$inputfiles" ]; then
# import all files
for inputfile in "${inputfiles[@]}" ; do
echo "import ${inputfile}..."
# import
sudo docker run --rm --link ${uuid} -v ${inputdir}:/data felixlohmeier/openrefine-client -H ${uuid} -c $inputfile $inputformat ${inputoptions[@]}
# show server logs
sudo docker attach ${uuid} &
# apply
sudo docker run --rm --link ${uuid} -v ${configdir}:/data felixlohmeier/openrefine-client -H ${uuid} -f ${jsonfile} ${projectid}
# statistics
ps -o start,etime,%mem,%cpu,rss -C java
# restart server to clear memory
@ -109,38 +97,68 @@ for projectid in "${projects[@]}" ; do
sudo docker rm ${uuid}
sudo docker run -d --name=${uuid} -v ${outputdir}:/data felixlohmeier/openrefine:${version} -i 0.0.0.0 -m ${ram} -d /data
until sudo docker run --rm --link ${uuid} --entrypoint /usr/bin/curl felixlohmeier/openrefine-client --silent -N http://${uuid}:3333 | cat | grep -q -o "OpenRefine" ; do sleep 1; done
echo ""
done
# export files
echo "export to file ${projectid}.tsv..."
# show server logs
sudo docker attach ${uuid} &
# export
sudo docker run --rm --link ${uuid} -v ${outputdir}:/data felixlohmeier/openrefine-client -H ${uuid} -E --output=${projectid}.tsv ${projectid}
# statistics
ps -o start,etime,%mem,%cpu,rss -C java
# restart server to clear memory
echo "restart OpenRefine server..."
sudo docker stop -t=5000 ${uuid}
sudo docker rm ${uuid}
sudo docker run -d --name=${uuid} -v ${outputdir}:/data felixlohmeier/openrefine:${version} -i 0.0.0.0 -m ${ram} -d /data
until sudo docker run --rm --link ${uuid} --entrypoint /usr/bin/curl felixlohmeier/openrefine-client --silent -N http://${uuid}:3333 | cat | grep -q -o "OpenRefine" ; do sleep 1; done
# time
echo "finished project $projectid @ $(date)"
fi
if [ -n "$jsonfiles" ]; then
# get project ids
projects=($(sudo docker run --rm --link ${uuid} felixlohmeier/openrefine-client -H ${uuid} -l | cut -c 2-14))
# copy existing projects for use with OpenRefine cross function
if [ -n "$crossprojects" ]; then
cp -r $crossdir/*.project $outputdir/
fi
# loop for all projects
for projectid in "${projects[@]}" ; do
echo "begin project $projectid @ $(date)"
# apply transformation rules
for jsonfile in "${jsonfiles[@]}" ; do
echo "transform ${jsonfile}..."
# show server logs
sudo docker attach ${uuid} &
# apply
sudo docker run --rm --link ${uuid} -v ${configdir}:/data felixlohmeier/openrefine-client -H ${uuid} -f ${jsonfile} ${projectid}
# statistics
ps -o start,etime,%mem,%cpu,rss -C java
# restart server to clear memory
echo "save project and restart OpenRefine server..."
sudo docker stop -t=5000 ${uuid}
sudo docker rm ${uuid}
sudo docker run -d --name=${uuid} -v ${outputdir}:/data felixlohmeier/openrefine:${version} -i 0.0.0.0 -m ${ram} -d /data
until sudo docker run --rm --link ${uuid} --entrypoint /usr/bin/curl felixlohmeier/openrefine-client --silent -N http://${uuid}:3333 | cat | grep -q -o "OpenRefine" ; do sleep 1; done
done
# export files
echo "export to file ${projectid}.tsv..."
# show server logs
sudo docker attach ${uuid} &
# export
sudo docker run --rm --link ${uuid} -v ${outputdir}:/data felixlohmeier/openrefine-client -H ${uuid} -E --output=${projectid}.tsv ${projectid}
# statistics
ps -o start,etime,%mem,%cpu,rss -C java
# restart server to clear memory
echo "restart OpenRefine server..."
sudo docker stop -t=5000 ${uuid}
sudo docker rm ${uuid}
sudo docker run -d --name=${uuid} -v ${outputdir}:/data felixlohmeier/openrefine:${version} -i 0.0.0.0 -m ${ram} -d /data
until sudo docker run --rm --link ${uuid} --entrypoint /usr/bin/curl felixlohmeier/openrefine-client --silent -N http://${uuid}:3333 | cat | grep -q -o "OpenRefine" ; do sleep 1; done
# time
echo "finished project $projectid @ $(date)"
echo ""
done
# list output files
echo "output (number of lines / size in bytes):"
wc -c -l ${outputdir}/*.tsv
echo ""
done
fi
# cleanup
echo "cleanup..."
sudo docker stop -t=5000 ${uuid}
sudo docker rm ${uuid}
sudo rm -r -f ${outputdir}/*.project
sudo rm -r -f ${outputdir}/workspace*.json
echo ""
# list output files
echo "output (number of lines / size in bytes):"
wc -c -l ${outputdir}/*.tsv
echo ""
# time
echo "finish: $(date)"