openrefine-batch/openrefine-batch.sh

234 lines
7.8 KiB
Bash
Raw Normal View History

2017-02-27 00:47:34 +01:00
#!/bin/bash
2017-03-01 20:47:48 +01:00
# openrefine-batch.sh, Felix Lohmeier, v0.6.1, 01.03.2017
2017-02-27 01:03:33 +01:00
# https://github.com/felixlohmeier/openrefine-batch
2017-02-27 00:47:34 +01:00
# user input: required directory arguments
#   $1 - directory with source files (an empty directory means transform only)
#   $2 - directory with config files (an empty directory means import only)
#   $3 - output directory (created if it does not exist)
#   $4 - directory with additional OpenRefine projects for use with the
#        cross function (may be an empty directory)
# The script exits with status 2 if one of these arguments is missing.
# All paths are quoted and the file lists are read line by line, so
# directories and file names that contain spaces stay intact.
# NOTE(review): find descends into subdirectories but only the base name (%f)
# is recorded for source and config files - confirm nested files are intended.

if [[ -z "$1" ]]; then
  echo 1>&2 "please provide path to directory with source files (leave empty to transform only)"
  exit 2
fi
inputdir=$(readlink -f -- "$1")
inputfiles=()
# skip the scan when the path did not resolve to a non-blank name
if [[ -n "${inputdir// }" ]]; then
  mapfile -t inputfiles < <(find -L "${inputdir}"/* -type f -printf '%f\n' 2>/dev/null)
fi

if [[ -z "$2" ]]; then
  echo 1>&2 "please provide path to directory with config files (leave empty to import only)"
  exit 2
fi
configdir=$(readlink -f -- "$2")
jsonfiles=()
if [[ -n "${configdir// }" ]]; then
  mapfile -t jsonfiles < <(find -L "${configdir}"/* -type f -printf '%f\n' 2>/dev/null)
fi

if [[ -z "$3" ]]; then
  echo 1>&2 "please provide path to output directory"
  exit 2
fi
# readlink -m is used here (instead of -f) because the directory may not exist yet
outputdir=$(readlink -m -- "$3")
# stop early if the output directory cannot be created
mkdir -p -- "${outputdir}" || exit 1

if [[ -z "$4" ]]; then
  echo 1>&2 "please provide path to directory with additional OpenRefine projects for use with cross function (may be empty)"
  exit 2
fi
crossdir=$(readlink -f -- "$4")
crossprojects=()
if [[ -n "${crossdir// }" ]]; then
  # only the directories directly inside the cross directory (-maxdepth 0 on each match)
  mapfile -t crossprojects < <(find -L "${crossdir}"/* -maxdepth 0 -type d -printf '%f\n' 2>/dev/null)
fi
#######################################
# Read the optional command line arguments and apply defaults.
# A default is used when an argument is missing or empty.
# Globals (written):
#   ram              - $5, OpenRefine heap space (default 4G)
#   version          - $6, tag of the felixlohmeier/openrefine image
#                      (default 2.7rc1)
#   restartfile      - $7, "restartfile-true" restarts the server after
#                      each file (default restartfile-true)
#   restarttransform - $8, "restarttransform-true" restarts the server after
#                      each transformation (default restarttransform-false)
#   export           - $9, "export-true" exports each project as TSV
#                      (default export-true)
#   inputformat      - "--format=$10", or empty if $10 is not given
#   inputoptions     - array with every argument from $11 onwards; the
#                      number of options is no longer capped at 15
# Arguments:
#   the positional parameters of the script ("$@")
#######################################
parse_optional_args() {
  ram="${5:-4G}"
  version="${6:-2.7rc1}"
  restartfile="${7:-restartfile-true}"
  restarttransform="${8:-restarttransform-false}"
  export="${9:-export-true}"
  inputformat="${10:+--format=${10}}"
  # as before, options are only taken into account if $11 is non-empty
  if [[ -n "${11}" ]]; then
    inputoptions=("${@:11}")
  else
    inputoptions=()
  fi
}
parse_optional_args "$@"
# variables
# random identifier, reported below as the docker container name
read -r uuid < /proc/sys/kernel/random/uuid

# report the effective settings of this run, one per line, followed by a
# blank line; array values are joined with single spaces
printf '%s\n' \
  "Input directory: ${inputdir}" \
  "Input files: ${inputfiles[*]}" \
  "Input format: ${inputformat}" \
  "Input options: ${inputoptions[*]}" \
  "Config directory: ${configdir}" \
  "Transformation rules: ${jsonfiles[*]}" \
  "Cross directory: ${crossdir}" \
  "Cross projects: ${crossprojects[*]}" \
  "OpenRefine heap space: ${ram}" \
  "OpenRefine version: ${version}" \
  "OpenRefine workspace: ${outputdir}" \
  "Export TSV to workspace: ${export}" \
  "Docker container name: ${uuid}" \
  "restart after file: ${restartfile}" \
  "restart after transform: ${restarttransform}" \
  ""

# time
printf 'begin: %s\n\n' "$(date)"
2017-03-01 17:48:13 +01:00
# launch server, import, transform, export, cleanup
#
# Reads the settings prepared earlier in the script: uuid, inputdir,
# inputfiles, inputformat, inputoptions, configdir, jsonfiles, crossdir,
# crossprojects, outputdir, ram, version, restartfile, restarttransform
# and export.

#######################################
# Show statistics of the running java processes.
#######################################
show_stats() {
  ps -o start,etime,%mem,%cpu,rss -C java --sort=start
}

#######################################
# Start OpenRefine in a detached container named ${uuid} with the output
# directory mounted as /data, wait until the server answers and show the
# server logs in the background.
# Exits with status 1 if the container cannot be started or stops while
# waiting, instead of polling forever.
#######################################
start_server() {
  if ! docker run -d --name="${uuid}" -v "${outputdir}:/data" "felixlohmeier/openrefine:${version}" -i 0.0.0.0 -m "${ram}" -d /data; then
    echo 1>&2 "could not start OpenRefine server"
    exit 1
  fi
  # wait until server is available
  until docker run --rm --link "${uuid}" --entrypoint /usr/bin/curl felixlohmeier/openrefine-client --silent -N "http://${uuid}:3333" | grep -q "OpenRefine"; do
    # give up if the server container is no longer running
    if [[ "$(docker inspect -f '{{.State.Running}}' "${uuid}" 2>/dev/null)" != "true" ]]; then
      echo 1>&2 "OpenRefine server stopped unexpectedly"
      docker logs "${uuid}" 1>&2
      docker rm "${uuid}" >/dev/null 2>&1
      exit 1
    fi
    sleep 1
  done
  # show server logs
  docker attach "${uuid}" &
}

#######################################
# Stop and remove the server container.
# The stop timeout is long, presumably so that OpenRefine can save the
# projects before the container is killed - TODO confirm.
#######################################
stop_server() {
  docker stop -t=5000 "${uuid}"
  docker rm "${uuid}"
}

#######################################
# Restart the server with the same workspace.
#######################################
restart_server() {
  stop_server
  start_server
}

#######################################
# Stop the server when the script is interrupted, so that no container
# is left behind.
# Arguments:
#   $1 - exit status to leave the script with
#######################################
abort() {
  trap - INT TERM
  echo 1>&2 "interrupted, stop OpenRefine server..."
  stop_server
  exit "$1"
}
trap 'abort 130' INT
trap 'abort 143' TERM

# launch server
echo "start OpenRefine server..."
start_server
echo ""

# import all files
if (( ${#inputfiles[@]} > 0 )); then
  echo "=== IMPORT ==="
  echo ""
  # collect format and options, leaving out empty values so that no
  # empty argument is passed to the client
  importargs=()
  for importarg in "${inputformat}" "${inputoptions[@]}"; do
    if [[ -n "${importarg}" ]]; then
      importargs+=("${importarg}")
    fi
  done
  for inputfile in "${inputfiles[@]}"; do
    echo "import ${inputfile}..."
    # run client with input command
    docker run --rm --link "${uuid}" -v "${inputdir}:/data" felixlohmeier/openrefine-client -H "${uuid}" -c "${inputfile}" "${importargs[@]}"
    # show statistics
    show_stats
    echo ""
    # restart server to clear memory
    if [[ "${restartfile}" == "restartfile-true" ]]; then
      echo "save project and restart OpenRefine server..."
      restart_server
      echo ""
    fi
  done
fi

echo "=== TRANSFORM / EXPORT ==="
echo ""

# get project ids
echo "get project ids..."
projectlist=$(docker run --rm --link "${uuid}" felixlohmeier/openrefine-client -H "${uuid}" -l)
if [[ -n "${projectlist}" ]]; then
  printf '%s\n' "${projectlist}"
fi
# characters 2-14 of each listed line are taken as the project id
projects=()
read -r -d '' -a projects < <(cut -c 2-14 <<<"${projectlist}")
echo ""

# provide additional OpenRefine projects for cross function
if (( ${#crossprojects[@]} > 0 )); then
  echo "provide additional projects for cross function..."
  # copy given projects to workspace
  rsync -a --exclude='*.project/history' "${crossdir}"/*.project "${outputdir}"
  # restart server to advertise copied projects
  echo "restart OpenRefine server to advertise copied projects..."
  restart_server
  echo ""
fi

# loop for all projects
for projectid in "${projects[@]}"; do
  # time
  echo "--- begin project $projectid @ $(date) ---"
  echo ""
  # apply transformation rules
  if (( ${#jsonfiles[@]} > 0 )); then
    for jsonfile in "${jsonfiles[@]}"; do
      echo "transform ${jsonfile}..."
      # run client with apply command
      docker run --rm --link "${uuid}" -v "${configdir}:/data" felixlohmeier/openrefine-client -H "${uuid}" -f "${jsonfile}" "${projectid}"
      # show statistics
      show_stats
      # restart server to clear memory
      if [[ "${restarttransform}" == "restarttransform-true" ]]; then
        echo "save project and restart OpenRefine server..."
        restart_server
      fi
      echo ""
    done
  fi
  # export project to workspace
  if [[ "${export}" == "export-true" ]]; then
    echo "export to file ${projectid}.tsv..."
    # run client with export command
    docker run --rm --link "${uuid}" -v "${outputdir}:/data" felixlohmeier/openrefine-client -H "${uuid}" -E --output="${projectid}.tsv" "${projectid}"
    # show statistics
    show_stats
    # restart server to clear memory
    if [[ "${restartfile}" == "restartfile-true" ]]; then
      echo "restart OpenRefine server..."
      restart_server
    fi
    echo ""
  fi
  # time
  echo "--- finished project $projectid @ $(date) ---"
  echo ""
done

# list output files
if [[ "${export}" == "export-true" ]]; then
  echo "output (number of lines / size in bytes):"
  wc -c -l "${outputdir}"/*.tsv
  echo ""
fi

# cleanup
echo "cleanup..."
stop_server
trap - INT TERM
# ${outputdir:?} aborts instead of deleting from / if the variable is empty
rm -r -f -- "${outputdir:?}"/workspace*.json
echo ""

# time
echo "finish: $(date)"