#!/bin/bash
# openrefine-batch.sh, Felix Lohmeier, v0.1, 27.02.2017
# https://github.com/...

# user input
# Argument 1 (required): directory containing the source files to import.
# Sets inputdir (absolute path) and inputfiles (array with the name of every
# entry directly inside that directory).
if [[ -z "${1:-}" ]]; then
  echo 1>&2 "please provide path to directory with source files"
  exit 2
fi
inputdir=$(readlink -f -- "$1")
if [[ ! -d "${inputdir}" ]]; then
  echo 1>&2 "input directory not found: $1"
  exit 2
fi
# Glob directly instead of re-splitting the output of basename, so names that
# contain whitespace or glob characters stay intact. nullglob makes an empty
# directory yield an empty array rather than a literal '*'.
shopt -s nullglob
inputfiles=("${inputdir}"/*)
shopt -u nullglob
# Strip the directory prefix from every element (same result as basename -a).
inputfiles=("${inputfiles[@]##*/}")
# Argument 2 (required): directory containing the transformation rule files.
# Sets configdir (absolute path) and jsonfiles (array with the name of every
# entry directly inside that directory).
if [[ -z "${2:-}" ]]; then
  echo 1>&2 "please provide path to directory with config files"
  exit 2
fi
configdir=$(readlink -f -- "$2")
if [[ ! -d "${configdir}" ]]; then
  echo 1>&2 "config directory not found: $2"
  exit 2
fi
# Glob directly instead of re-splitting the output of basename, so names that
# contain whitespace or glob characters stay intact. nullglob makes an empty
# directory yield an empty array rather than a literal '*'.
shopt -s nullglob
jsonfiles=("${configdir}"/*)
shopt -u nullglob
# Strip the directory prefix from every element (same result as basename -a).
jsonfiles=("${jsonfiles[@]##*/}")
# Argument 3 (required): output directory; created if it does not exist yet.
# Sets outputdir (absolute path).
if [[ -z "${3:-}" ]]; then
  echo 1>&2 "please provide path to output directory"
  exit 2
fi
outputdir=$(readlink -f -- "$3")
# Stop here if the path cannot be resolved or created: later steps mount this
# directory into containers and delete files below it, so it must never be
# empty or missing.
if [[ -z "${outputdir}" ]] || ! mkdir -p -- "${outputdir}"; then
  echo 1>&2 "cannot create output directory: $3"
  exit 2
fi
# Argument 4 (optional): heap space for OpenRefine; defaults to 4G when the
# argument is missing or empty.
ram="${4:-4G}"
# Argument 5 (optional): input format. Stored as a ready-made
# "--format=<value>" option, or as an empty string when the argument is
# missing or empty.
inputformat="${5:+--format=${5}}"
# Arguments 6 and up (optional): extra import options, kept as an array.
# As before, nothing is collected when argument 6 is missing or empty.
# Taking "${@:6}" instead of ten fixed positions avoids padding the array with
# empty strings and no longer drops options beyond the 15th argument.
if [[ -z "${6:-}" ]]; then
  inputoptions=()
else
  inputoptions=("${@:6}")
fi

# variables
# version is the tag of the OpenRefine image to run; uuid is a fresh random
# identifier that is reported below as the Docker container name.
version="2.7rc1"
uuid=$(</proc/sys/kernel/random/uuid)
if [[ -z "${uuid}" ]]; then
  echo 1>&2 "could not read a UUID from /proc/sys/kernel/random/uuid"
  exit 1
fi

# Print the effective settings. Arrays are joined with [*] because they are
# embedded in a single string; the printed text is the same as before.
echo "Input dir: $inputdir"
echo "Input files: ${inputfiles[*]}"
echo "Input format: $inputformat"
echo "Input options: ${inputoptions[*]}"
echo "Transformation rules: ${jsonfiles[*]}"
echo "OpenRefine heap space: $ram"
echo "OpenRefine version: $version"
echo "Docker container: $uuid"
echo "Output directory: $outputdir"
echo ""

# time
echo "begin: $(date)"
echo ""

# launch openrefine server

# Start the OpenRefine server container (detached, named ${uuid}, with
# ${outputdir} mounted at /data and passed to the server via -d) and block
# until its web interface answers on port 3333.
# Exits the script if the container cannot be started or stops running while
# waiting, instead of polling forever.
start_server() {
  if ! sudo docker run -d --name="${uuid}" -v "${outputdir}:/data" \
    "felixlohmeier/openrefine:${version}" -i 0.0.0.0 -m "${ram}" -d /data; then
    echo 1>&2 "failed to start OpenRefine server container ${uuid}"
    exit 1
  fi
  # Probe with curl from a throwaway client container linked to the server;
  # the server counts as ready once the response mentions "OpenRefine".
  until sudo docker run --rm --link "${uuid}" --entrypoint /usr/bin/curl \
    felixlohmeier/openrefine-client --silent -N "http://${uuid}:3333" \
    | cat | grep -q -o "OpenRefine"; do
    if [[ "$(sudo docker inspect -f '{{.State.Running}}' "${uuid}" 2>/dev/null)" != "true" ]]; then
      echo 1>&2 "OpenRefine server container ${uuid} is not running"
      exit 1
    fi
    sleep 1
  done
}

# Stop the server container (with a long stop timeout) and remove it so the
# name ${uuid} can be reused.
stop_server() {
  sudo docker stop -t=5000 "${uuid}"
  sudo docker rm "${uuid}"
}

# Restart the server to clear memory: stop and remove the container, then
# start a fresh one on the same /data directory and wait until it is ready.
restart_server() {
  stop_server
  start_server
}

# Show the server logs by attaching to the container in the background.
attach_server_logs() {
  sudo docker attach "${uuid}" &
}

# Print start time, elapsed time, memory and CPU usage of java processes.
show_java_stats() {
  ps -o start,etime,%mem,%cpu,rss -C java
}

echo "start OpenRefine server..."
start_server
echo ""

# import all files
# Collect the optional import arguments once. Empty entries are skipped so no
# empty argument reaches the client, and each option stays a single word.
import_args=()
if [[ -n "${inputformat}" ]]; then
  import_args+=("${inputformat}")
fi
for option in "${inputoptions[@]}"; do
  if [[ -n "${option}" ]]; then
    import_args+=("${option}")
  fi
done

for inputfile in "${inputfiles[@]}"; do
  echo "import ${inputfile}..."
  # import
  sudo docker run --rm --link "${uuid}" -v "${inputdir}:/data" \
    felixlohmeier/openrefine-client -H "${uuid}" -c "${inputfile}" \
    "${import_args[@]}"
  # show server logs
  attach_server_logs
  # statistics
  show_java_stats
  # restart server to clear memory
  echo "save project and restart OpenRefine server..."
  restart_server
  echo ""
done

# get project ids
# Characters 2-14 of every line of the client's project listing are taken as
# the project id. read -a splits on whitespace like the former unquoted
# command substitution did, but without pathname expansion.
projects=()
while read -r -a fields; do
  projects+=("${fields[@]}")
done < <(sudo docker run --rm --link "${uuid}" felixlohmeier/openrefine-client \
  -H "${uuid}" -l | cut -c 2-14)

# loop for all projects
for projectid in "${projects[@]}"; do
  echo "begin project $projectid @ $(date)"
  # apply transformation rules, restarting the server after each one
  for jsonfile in "${jsonfiles[@]}"; do
    echo "transform ${jsonfile}..."
    # show server logs
    attach_server_logs
    # apply
    sudo docker run --rm --link "${uuid}" -v "${configdir}:/data" \
      felixlohmeier/openrefine-client -H "${uuid}" -f "${jsonfile}" \
      "${projectid}"
    # statistics
    show_java_stats
    # restart server to clear memory
    echo "save project and restart OpenRefine server..."
    restart_server
  done
  # export files
  echo "export to file ${projectid}.tsv..."
  # show server logs
  attach_server_logs
  # export
  sudo docker run --rm --link "${uuid}" -v "${outputdir}:/data" \
    felixlohmeier/openrefine-client -H "${uuid}" -E \
    --output="${projectid}.tsv" "${projectid}"
  # statistics
  show_java_stats
  # restart server to clear memory
  echo "restart OpenRefine server..."
  restart_server
  # time
  echo "finished project $projectid @ $(date)"
  echo ""
done

# cleanup
echo "cleanup..."
stop_server
# ${outputdir:?} aborts if the variable is empty, so these globs can never
# expand against the filesystem root.
sudo rm -r -f -- "${outputdir:?}"/*.project
sudo rm -r -f -- "${outputdir:?}"/workspace*.json
echo ""

# list output files
echo "output (number of lines / size in bytes):"
# Quote the directory so paths with spaces work; the glob itself stays
# unquoted so it still expands to the exported .tsv files.
wc -c -l "${outputdir}"/*.tsv
echo ""

# time
echo "finish: $(date)"