{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Test function create in module cli" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Install\n", "\n", "This notebook requires a Python 2.7 environment and an OpenRefine server running at http://127.0.0.1:3333." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[33mDEPRECATION: Python 2.7 will reach the end of its life on January 1st, 2020. Please upgrade your Python as Python 2.7 won't be maintained after that date. A future version of pip will drop support for Python 2.7. More details about Python 2 support in pip, can be found at https://pip.pypa.io/en/latest/development/release-process/#python-2-support\u001b[0m\n", "Processing /home/felix/git/openrefine-client\n", "Requirement already satisfied, skipping upgrade: urllib2_file in /home/felix/.local/lib/python2.7/site-packages (from openrefine-client==0.3.7) (0.2.1)\n", "Installing collected packages: openrefine-client\n", " Found existing installation: openrefine-client 0.3.7\n", " Uninstalling openrefine-client-0.3.7:\n", " Successfully uninstalled openrefine-client-0.3.7\n", " Running setup.py install for openrefine-client ... \u001b[?25ldone\n", "\u001b[?25hSuccessfully installed openrefine-client-0.3.7\n" ] } ], "source": [ "import sys\n", "!{sys.executable} -m pip install .. 
--user --upgrade" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from google.refine import cli" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## CSV" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### default" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id: 1618143866116\n", "rows: 10\n", " id: 1618143866116\n", " url: http://127.0.0.1:3333/project?project=1618143866116\n", " name: duplicates\n", " modified: 2019-08-20T02:12:53Z\n", " created: 2019-08-20T02:12:53Z\n", " rowCount: 10\n", "importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.csv', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n", " column 001: email\n", " column 002: name\n", " column 003: state\n", " column 004: gender\n", " column 005: purchase\n", " column 006: count\n", " column 007: date\n", "email\tname\tstate\tgender\tpurchase\tcount\tdate\n", "danny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: ๐Ÿ“บ)\t1\tWed, 4 Jul 2001\n", "melanie.white@example2.edu\tMelanie White\tNC\tF\t\t1\t2001-07-04T12:08:56\n", "danny.baron@example1.com\t\" D.\t(\"\"Tab\"\") Baron\"\tCA\tM\tWinter jacket\t1\t2001-07-04\n", "ben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1\t2001/07/04\n", "arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table\t1\t2001-07\n", "danny.baron@example1.com\tDaniel Baron\t\t\tBike\t1\t2001\n", "jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1\t2000\n", "melanie.white@example2.edu\tMelanie White\tNC\tF\t'iPad'\t1\t1999\n", "ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1\t1998\n", "arthur.duff@example4.com\tArthur 
Duff\tOR\tM\tNight table\t1\t1997\n", "Project 1618143866116 has been successfully deleted\n" ] } ], "source": [ "p = cli.create('data/cli/duplicates.csv')\n", "cli.info(p.project_id)\n", "cli.export(p.project_id)\n", "cli.delete(p.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### encoding\n", "\n", "check TV symbol in line 1" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id: 1676755759011\n", "rows: 10\n", "email\tname\tstate\tgender\tpurchase\tcount\tdate\n", "danny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: รฐยŸย“ยบ)\t1\tWed, 4 Jul 2001\n", "melanie.white@example2.edu\tMelanie White\tNC\tF\t\t1\t2001-07-04T12:08:56\n", "danny.baron@example1.com\t\" D.\t(\"\"Tab\"\") Baron\"\tCA\tM\tWinter jacket\t1\t2001-07-04\n", "ben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1\t2001/07/04\n", "arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table\t1\t2001-07\n", "danny.baron@example1.com\tDaniel Baron\t\t\tBike\t1\t2001\n", "jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1\t2000\n", "melanie.white@example2.edu\tMelanie White\tNC\tF\t'iPad'\t1\t1999\n", "ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1\t1998\n", "arthur.duff@example4.com\tArthur Duff\tOR\tM\tNight table\t1\t1997\n", "Project 1676755759011 has been successfully deleted\n" ] } ], "source": [ "p = cli.create('data/cli/duplicates.csv', encoding='ISO-8859-1')\n", "cli.export(p.project_id)\n", "cli.delete(p.project_id)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id: 1970849280401\n", "rows: 10\n", "email\tname\tstate\tgender\tpurchase\tcount\tdate\n", "danny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: ๐Ÿ“บ)\t1\tWed, 4 Jul 2001\n", "melanie.white@example2.edu\tMelanie White\tNC\tF\t\t1\t2001-07-04T12:08:56\n", "danny.baron@example1.com\t\" D.\t(\"\"Tab\"\") 
Baron\"\tCA\tM\tWinter jacket\t1\t2001-07-04\n", "ben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1\t2001/07/04\n", "arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table\t1\t2001-07\n", "danny.baron@example1.com\tDaniel Baron\t\t\tBike\t1\t2001\n", "jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1\t2000\n", "melanie.white@example2.edu\tMelanie White\tNC\tF\t'iPad'\t1\t1999\n", "ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1\t1998\n", "arthur.duff@example4.com\tArthur Duff\tOR\tM\tNight table\t1\t1997\n", "Project 1970849280401 has been successfully deleted\n" ] } ], "source": [ "p = cli.create('data/cli/duplicates.csv', encoding='UTF-8')\n", "cli.export(p.project_id)\n", "cli.delete(p.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### guessCellValueTypes\n", "\n", "check OpenRefine GUI at url below: numbers should be green" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id: 2231557582225\n", "rows: 10\n", " id: 2231557582225\n", " url: http://127.0.0.1:3333/project?project=2231557582225\n", " name: duplicates\n", " modified: 2019-08-20T02:12:53Z\n", " created: 2019-08-20T02:12:53Z\n", " rowCount: 10\n", "importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.csv', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': True, u'includeFileSources': False}]\n", " column 001: email\n", " column 002: name\n", " column 003: state\n", " column 004: gender\n", " column 005: purchase\n", " column 006: count\n", " column 007: date\n" ] } ], "source": [ "p = cli.create('data/cli/duplicates.csv', guessCellValueTypes=True)\n", "cli.info(p.project_id)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, 
"outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Project 2231557582225 has been successfully deleted\n" ] } ], "source": [ "cli.delete(p.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### headerLines\n", "\n", "check column names, should be Column 1..." ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id: 2294888751269\n", "rows: 11\n", "Column 1\tColumn 2\tColumn 3\tColumn 4\tColumn 5\tColumn 6\tColumn 7\n", "email\tname\tstate\tgender\tpurchase\tcount\tdate\n", "danny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: ๐Ÿ“บ)\t1\tWed, 4 Jul 2001\n", "melanie.white@example2.edu\tMelanie White\tNC\tF\t\t1\t2001-07-04T12:08:56\n", "danny.baron@example1.com\t\" D.\t(\"\"Tab\"\") Baron\"\tCA\tM\tWinter jacket\t1\t2001-07-04\n", "ben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1\t2001/07/04\n", "arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table\t1\t2001-07\n", "danny.baron@example1.com\tDaniel Baron\t\t\tBike\t1\t2001\n", "jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1\t2000\n", "melanie.white@example2.edu\tMelanie White\tNC\tF\t'iPad'\t1\t1999\n", "ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1\t1998\n", "arthur.duff@example4.com\tArthur Duff\tOR\tM\tNight table\t1\t1997\n", "Project 2294888751269 has been successfully deleted\n" ] } ], "source": [ "p = cli.create('data/cli/duplicates.csv', headerLines=0)\n", "cli.export(p.project_id)\n", "cli.delete(p.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### ignoreLines\n", "\n", "check column names, should start with arthur.duff as header" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id: 1990694976789\n", "rows: 5\n", "arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table\t1\t2001-07\n", 
"danny.baron@example1.com\tDaniel Baron\t\t\tBike\t1\t2001\n", "jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1\t2000\n", "melanie.white@example2.edu\tMelanie White\tNC\tF\t'iPad'\t1\t1999\n", "ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1\t1998\n", "arthur.duff@example4.com\tArthur Duff\tOR\tM\tNight table\t1\t1997\n", "Project 1990694976789 has been successfully deleted\n" ] } ], "source": [ "p = cli.create('data/cli/duplicates.csv', ignoreLines=5)\n", "cli.export(p.project_id)\n", "cli.delete(p.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### limit\n", "\n", "should contain 5 rows" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id: 1834697810094\n", "rows: 5\n", "email\tname\tstate\tgender\tpurchase\tcount\tdate\n", "danny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: ๐Ÿ“บ)\t1\tWed, 4 Jul 2001\n", "melanie.white@example2.edu\tMelanie White\tNC\tF\t\t1\t2001-07-04T12:08:56\n", "danny.baron@example1.com\t\" D.\t(\"\"Tab\"\") Baron\"\tCA\tM\tWinter jacket\t1\t2001-07-04\n", "ben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1\t2001/07/04\n", "arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table\t1\t2001-07\n", "Project 1834697810094 has been successfully deleted\n" ] } ], "source": [ "p = cli.create('data/cli/duplicates.csv', limit=5)\n", "cli.export(p.project_id)\n", "cli.delete(p.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### separator and processQuotes\n", "\n", "should contain 10 rows and 2 columns (Column 2)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id: 1745680810911\n", "rows: 10\n", "email,name,state,gender,purchase,count,date\tColumn 2\n", "\"danny.baron@example1.com,Danny Baron,CA,M,TV (UTF-8: ๐Ÿ“บ),1,\"\"Wed, 4 Jul 2001\"\t\n", 
"melanie.white@example2.edu,Melanie White,NC,F,,1,2001-07-04T12:08:56\t\n", "danny.baron@example1.com, D.\t\"(\"\"Tab\"\") Baron,CA,M,Winter jacket,1,2001-07-04\"\n", "ben.tyler@example3.org,Ben Tyler,NV,M,Flashlight,1,2001/07/04\t\n", "arthur.duff@example4.com,Arthur Duff,OR,M,Dining table,1,2001-07\t\n", "danny.baron@example1.com,Daniel Baron,,,Bike,1,2001\t\n", "jean.griffith@example5.org,Jean Griffith,WA,F,Power drill,1,2000\t\n", "melanie.white@example2.edu,Melanie White,NC,F,'iPad',1,1999\t\n", "ben.morisson@example6.org,Ben Morisson,FL,M,Amplifier,1,1998\t\n", "arthur.duff@example4.com,Arthur Duff,OR,M,Night table,1,1997\t\n", "Project 1745680810911 has been successfully deleted\n" ] } ], "source": [ "p = cli.create('data/cli/duplicates.csv', separator='\\t', processQuotes=False)\n", "cli.export(p.project_id)\n", "cli.delete(p.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### projectName" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id: 2022088294800\n", "rows: 10\n", " id: 2022088294800\n", " url: http://127.0.0.1:3333/project?project=2022088294800\n", " name: foo\n", " modified: 2019-08-20T02:12:53Z\n", " created: 2019-08-20T02:12:53Z\n", " rowCount: 10\n", "importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.csv', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'foo', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n", " column 001: email\n", " column 002: name\n", " column 003: state\n", " column 004: gender\n", " column 005: purchase\n", " column 006: count\n", " column 007: date\n", "Project 2022088294800 has been successfully deleted\n" ] } ], "source": [ "p = cli.create('data/cli/duplicates.csv', projectName='foo')\n", 
"cli.info(p.project_id)\n", "cli.delete(p.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### projectTags (introduced in OpenRefine 2.8)\n", "\n", "check manually at http://127.0.0.1:3333 > Open Project if tags were stored" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id: 2228120867351\n", "rows: 10\n", " id: 2228120867351\n", " url: http://127.0.0.1:3333/project?project=2228120867351\n", " name: duplicates\n", " tags: [u'client1', u'beta']\n", " modified: 2019-08-20T02:12:53Z\n", " created: 2019-08-20T02:12:53Z\n", " rowCount: 10\n", "importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.csv', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'duplicates', u'projectTags': [u'client1', u'beta'], u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n", " column 001: email\n", " column 002: name\n", " column 003: state\n", " column 004: gender\n", " column 005: purchase\n", " column 006: count\n", " column 007: date\n" ] } ], "source": [ "p = cli.create('data/cli/duplicates.csv', projectTags=['client1', 'beta'])\n", "cli.info(p.project_id)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Project 2228120867351 has been successfully deleted\n" ] } ], "source": [ "cli.delete(p.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### skipDataLines\n", "\n", "should contain 5 rows" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id: 1725478809832\n", "rows: 5\n", "email\tname\tstate\tgender\tpurchase\tcount\tdate\n", "danny.baron@example1.com\tDaniel 
Baron\t\t\tBike\t1\t2001\n", "jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1\t2000\n", "melanie.white@example2.edu\tMelanie White\tNC\tF\t'iPad'\t1\t1999\n", "ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1\t1998\n", "arthur.duff@example4.com\tArthur Duff\tOR\tM\tNight table\t1\t1997\n", "Project 1725478809832 has been successfully deleted\n" ] } ], "source": [ "p = cli.create('data/cli/duplicates.csv', skipDataLines=5)\n", "cli.export(p.project_id)\n", "cli.delete(p.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### storeBlankCellsAsNulls\n", "\n", "check OpenRefine GUI at url below:\n", "* All > View > Show/Hide 'null' values in cells\n", "* row 6 should contain null values in columns state and gender" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id: 2533896794214\n", "rows: 10\n", " id: 2533896794214\n", " url: http://127.0.0.1:3333/project?project=2533896794214\n", " name: duplicates\n", " modified: 2019-08-20T02:12:54Z\n", " created: 2019-08-20T02:12:54Z\n", " rowCount: 10\n", "importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.csv', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': True, u'includeFileSources': False}]\n", " column 001: email\n", " column 002: name\n", " column 003: state\n", " column 004: gender\n", " column 005: purchase\n", " column 006: count\n", " column 007: date\n" ] } ], "source": [ "p = cli.create('data/cli/duplicates.csv', storeBlankCellsAsNulls=True)\n", "cli.info(p.project_id)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Project 2533896794214 has been successfully 
deleted\n" ] } ], "source": [ "cli.delete(p.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## TSV" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### default" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id: 2281824651803\n", "rows: 10\n", " id: 2281824651803\n", " url: http://127.0.0.1:3333/project?project=2281824651803\n", " name: duplicates\n", " modified: 2019-08-20T02:12:54Z\n", " created: 2019-08-20T02:12:54Z\n", " rowCount: 10\n", "importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.tsv', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'duplicates', u'processQuotes': True, u'limit': -1, u'trimStrings': False, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n", " column 001: email\n", " column 002: name\n", " column 003: state\n", " column 004: gender\n", " column 005: purchase\n", " column 006: count\n", " column 007: date\n", "email\tname\tstate\tgender\tpurchase\tcount\tdate\n", "danny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: ๐Ÿ“บ)\t1\tWed, 4 Jul 2001\n", "melanie.white@example2.edu\tMelanie White\tNC\tF\t\t1\t2001-07-04T12:08:56\n", "danny.baron@example1.com\t\"D.\t(\"\"Tab\"\") Baron\"\tCA\tM\tWinter jacket\t1\t2001-07-04\n", "ben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1\t2001/07/04\n", "arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table\t1\t2001-07\n", "danny.baron@example1.com\tDaniel Baron\t\t\tBike\t1\t2001\n", "jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1\t2000\n", "melanie.white@example2.edu\tMelanie White\tNC\tF\t'iPad'\t1\t1999\n", "ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1\t1998\n", "arthur.duff@example4.com\tArthur Duff\tOR\tM\tNight table\t1\t1997\n", "Project 2281824651803 has been successfully deleted\n" ] } ], "source": [ "p = 
cli.create('data/cli/duplicates.tsv')\n", "cli.info(p.project_id)\n", "cli.export(p.project_id)\n", "cli.delete(p.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## JSON" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### default" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id: 2534262116323\n", "rows: 10\n", " id: 2534262116323\n", " url: http://127.0.0.1:3333/project?project=2534262116323\n", " name: duplicates\n", " modified: 2019-08-20T02:12:54Z\n", " created: 2019-08-20T02:12:54Z\n", " rowCount: 10\n", "importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.json', u'storeBlankRows': True, u'encoding': u'', u'recordPath': [u'_', u'_'], u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n", " column 001: _ - name\n", " column 002: _ - date\n", " column 003: _ - email\n", " column 004: _ - state\n", " column 005: _ - count\n", " column 006: _ - gender\n", " column 007: _ - purchase\n", "_ - name\t_ - date\t_ - email\t_ - state\t_ - count\t_ - gender\t_ - purchase\n", "Danny Baron\tWed, 4 Jul 2001\tdanny.baron@example1.com\tCA\t1\tM\tTV (UTF-8: ๐Ÿ“บ)\n", "Melanie White\t2001-07-04T12:08:56\tmelanie.white@example2.edu\tNC\t1\tF\t\n", "\" D.\t(\"\"Tab\"\") Baron\"\t2001-07-04\tdanny.baron@example1.com\tCA\t1\tM\tWinter jacket\n", "Ben Tyler\t2001/07/04\tben.tyler@example3.org\tNV\t1\tM\tFlashlight\n", "Arthur Duff\t2001-07\tarthur.duff@example4.com\tOR\t1\tM\tDining table\n", "Daniel Baron\t2001\tdanny.baron@example1.com\t\t1\t\tBike\n", "Jean Griffith\t2000\tjean.griffith@example5.org\tWA\t1\tF\tPower drill\n", "Melanie White\t1999\tmelanie.white@example2.edu\tNC\t1\tF\t'iPad'\n", "Ben 
Morisson\t1998\tben.morisson@example6.org\tFL\t1\tM\tAmplifier\n", "Arthur Duff\t1997\tarthur.duff@example4.com\tOR\t1\tM\tNight table\n", "Project 2534262116323 has been successfully deleted\n" ] } ], "source": [ "p = cli.create('data/cli/duplicates.json')\n", "cli.info(p.project_id)\n", "cli.export(p.project_id)\n", "cli.delete(p.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### trimStrings (broken, does not work in the GUI either)\n", "\n", "check row 3 if spaces before `D.` are deleted" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id: 2495073177504\n", "rows: 10\n", " id: 2495073177504\n", " url: http://127.0.0.1:3333/project?project=2495073177504\n", " name: duplicates\n", " modified: 2019-08-20T02:12:54Z\n", " created: 2019-08-20T02:12:54Z\n", " rowCount: 10\n", "importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.json', u'storeBlankRows': True, u'encoding': u'', u'recordPath': [u'_', u'_'], u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': True, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n", " column 001: _ - name\n", " column 002: _ - date\n", " column 003: _ - email\n", " column 004: _ - state\n", " column 005: _ - count\n", " column 006: _ - gender\n", " column 007: _ - purchase\n", "_ - name\t_ - date\t_ - email\t_ - state\t_ - count\t_ - gender\t_ - purchase\n", "Danny Baron\tWed, 4 Jul 2001\tdanny.baron@example1.com\tCA\t1\tM\tTV (UTF-8: ๐Ÿ“บ)\n", "Melanie White\t2001-07-04T12:08:56\tmelanie.white@example2.edu\tNC\t1\tF\t\n", "\" D.\t(\"\"Tab\"\") Baron\"\t2001-07-04\tdanny.baron@example1.com\tCA\t1\tM\tWinter jacket\n", "Ben Tyler\t2001/07/04\tben.tyler@example3.org\tNV\t1\tM\tFlashlight\n", "Arthur Duff\t2001-07\tarthur.duff@example4.com\tOR\t1\tM\tDining table\n", "Daniel 
Baron\t2001\tdanny.baron@example1.com\t\t1\t\tBike\n", "Jean Griffith\t2000\tjean.griffith@example5.org\tWA\t1\tF\tPower drill\n", "Melanie White\t1999\tmelanie.white@example2.edu\tNC\t1\tF\t'iPad'\n", "Ben Morisson\t1998\tben.morisson@example6.org\tFL\t1\tM\tAmplifier\n", "Arthur Duff\t1997\tarthur.duff@example4.com\tOR\t1\tM\tNight table\n", "Project 2495073177504 has been successfully deleted\n" ] } ], "source": [ "p = cli.create('data/cli/duplicates.json', trimStrings=True)\n", "cli.info(p.project_id)\n", "cli.export(p.project_id)\n", "cli.delete(p.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### recordPath" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id: 1671966444040\n", "rows: 10\n", " id: 1671966444040\n", " url: http://127.0.0.1:3333/project?project=1671966444040\n", " name: duplicates\n", " modified: 2019-08-20T02:12:54Z\n", " created: 2019-08-20T02:12:54Z\n", " rowCount: 10\n", "importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.json', u'storeBlankRows': True, u'encoding': u'', u'recordPath': [u'_', u'_', u'purchase'], u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n", " column 001: purchase\n", "purchase\n", "TV (UTF-8: ๐Ÿ“บ)\n", "\n", "Winter jacket\n", "Flashlight\n", "Dining table\n", "Bike\n", "Power drill\n", "'iPad'\n", "Amplifier\n", "Night table\n", "Project 1671966444040 has been successfully deleted\n" ] } ], "source": [ "p = cli.create('data/cli/duplicates.json', recordPath=['_', '_', 'purchase'])\n", "cli.info(p.project_id)\n", "cli.export(p.project_id)\n", "cli.delete(p.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### storeEmptyStrings\n", "\n", "default: True; set to False for 
null values\n", "\n", "check OpenRefine GUI at url below:\n", "* All > View > Show/Hide 'null' values in cells\n", "* row 6 should contain null values in columns state and gender" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id: 2078676878032\n", "rows: 10\n", " id: 2078676878032\n", " url: http://127.0.0.1:3333/project?project=2078676878032\n", " name: duplicates\n", " modified: 2019-08-20T02:12:54Z\n", " created: 2019-08-20T02:12:54Z\n", " rowCount: 10\n", "importOptionMetadata: [{u'storeEmptyStrings': False, u'fileSource': u'data/cli/duplicates.json', u'storeBlankRows': True, u'encoding': u'', u'recordPath': [u'_', u'_'], u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n", " column 001: _ - name\n", " column 002: _ - date\n", " column 003: _ - email\n", " column 004: _ - count\n", " column 005: _ - purchase\n", " column 006: _ - state\n", " column 007: _ - gender\n" ] } ], "source": [ "p = cli.create('data/cli/duplicates.json', storeEmptyStrings=False)\n", "cli.info(p.project_id)" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Project 2078676878032 has been successfully deleted\n" ] } ], "source": [ "cli.delete(p.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## XML" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### default" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id: 2264312539076\n", "rows: 80\n", " id: 2264312539076\n", " url: http://127.0.0.1:3333/project?project=2264312539076\n", " name: duplicates\n", " modified: 
2019-08-20T02:12:54Z\n", " created: 2019-08-20T02:12:54Z\n", " rowCount: 80\n", "importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.xml', u'storeBlankRows': True, u'encoding': u'', u'recordPath': [u'root'], u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n", " column 001: root\n", " column 002: root - record\n", " column 003: root - record - name\n", " column 004: root - record - date\n", " column 005: root - record - email\n", " column 006: root - record - count\n", " column 007: root - record - purchase\n", " column 008: root - record - state\n", " column 009: root - record - gender\n", "root\troot - record\troot - record - name\troot - record - date\troot - record - email\troot - record - count\troot - record - purchase\troot - record - state\troot - record - gender\n", "\"\n", " \"\t\"\n", " \"\tDanny Baron\tWed, 4 Jul 2001\tdanny.baron@example1.com\t1\tTV (UTF-8: ๐Ÿ“บ)\tCA\tM\n", "\"\n", " \"\t\"\n", " \"\t\t\t\t\t\t\t\n", "\"\n", " \"\t\"\n", " \"\t\t\t\t\t\t\t\n", "\"\n", " \"\t\"\n", " \"\t\t\t\t\t\t\t\n", "\"\n", " \"\t\"\n", " \"\t\t\t\t\t\t\t\n", "\"\n", " \"\t\"\n", " \"\t\t\t\t\t\t\t\n", "\"\n", " \"\t\"\n", " \"\t\t\t\t\t\t\t\n", "\"\n", " \"\t\"\n", " \"\t\t\t\t\t\t\t\n", "\"\n", " \"\t\"\n", " \"\tMelanie White\t2001-07-04T12:08:56\tmelanie.white@example2.edu\t1\t\tNC\tF\n", "\"\n", " \"\t\"\n", " \"\t\t\t\t\t\t\t\n", "\"\n", "\"\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\" D.\t(\"\"Tab\"\") Baron\"\t2001-07-04\tdanny.baron@example1.com\t1\tWinter jacket\tCA\tM\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " 
\"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\tBen Tyler\t2001/07/04\tben.tyler@example3.org\t1\tFlashlight\tNV\tM\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\tArthur Duff\t2001-07\tarthur.duff@example4.com\t1\tDining table\tOR\tM\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\tDaniel Baron\t2001\tdanny.baron@example1.com\t1\tBike\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\tJean Griffith\t2000\tjean.griffith@example5.org\t1\tPower drill\tWA\tF\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\tMelanie White\t1999\tmelanie.white@example2.edu\t1\t'iPad'\tNC\tF\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\tBen Morisson\t1998\tben.morisson@example6.org\t1\tAmplifier\tFL\tM\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", 
"\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\tArthur Duff\t1997\tarthur.duff@example4.com\t1\tNight table\tOR\tM\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "Project 2264312539076 has been successfully deleted\n" ] } ], "source": [ "p = cli.create('data/cli/duplicates.xml')\n", "cli.info(p.project_id)\n", "cli.export(p.project_id)\n", "cli.delete(p.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### trimStrings (broken, does not work in the GUI either)\n", "\n", "check if spaces before `D.` are deleted" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id: 1917953863988\n", "rows: 80\n", " id: 1917953863988\n", " url: http://127.0.0.1:3333/project?project=1917953863988\n", " name: duplicates\n", " modified: 2019-08-20T02:12:54Z\n", " created: 2019-08-20T02:12:54Z\n", " rowCount: 80\n", "importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.xml', u'storeBlankRows': True, u'encoding': u'', u'recordPath': [u'root'], u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': True, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n", " column 001: root\n", " column 002: root - record\n", " column 003: root - record - name\n", " column 004: root - record - date\n", " column 005: root - record - email\n", " column 006: root - record - count\n", " column 007: root - record - purchase\n", " column 008: root - record - state\n", " column 009: root - record - gender\n", "root\troot - record\troot - record - name\troot - record - date\troot - record - email\troot - record - count\troot - record - purchase\troot - record - state\troot 
- record - gender\n", "\"\n", " \"\t\"\n", " \"\tDanny Baron\tWed, 4 Jul 2001\tdanny.baron@example1.com\t1\tTV (UTF-8: ๐Ÿ“บ)\tCA\tM\n", "\"\n", " \"\t\"\n", " \"\t\t\t\t\t\t\t\n", "\"\n", " \"\t\"\n", " \"\t\t\t\t\t\t\t\n", "\"\n", " \"\t\"\n", " \"\t\t\t\t\t\t\t\n", "\"\n", " \"\t\"\n", " \"\t\t\t\t\t\t\t\n", "\"\n", " \"\t\"\n", " \"\t\t\t\t\t\t\t\n", "\"\n", " \"\t\"\n", " \"\t\t\t\t\t\t\t\n", "\"\n", " \"\t\"\n", " \"\t\t\t\t\t\t\t\n", "\"\n", " \"\t\"\n", " \"\tMelanie White\t2001-07-04T12:08:56\tmelanie.white@example2.edu\t1\t\tNC\tF\n", "\"\n", " \"\t\"\n", " \"\t\t\t\t\t\t\t\n", "\"\n", "\"\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\" D.\t(\"\"Tab\"\") Baron\"\t2001-07-04\tdanny.baron@example1.com\t1\tWinter jacket\tCA\tM\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\tBen Tyler\t2001/07/04\tben.tyler@example3.org\t1\tFlashlight\tNV\tM\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\tArthur Duff\t2001-07\tarthur.duff@example4.com\t1\tDining table\tOR\tM\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\tDaniel Baron\t2001\tdanny.baron@example1.com\t1\tBike\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", 
"\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\tJean Griffith\t2000\tjean.griffith@example5.org\t1\tPower drill\tWA\tF\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\tMelanie White\t1999\tmelanie.white@example2.edu\t1\t'iPad'\tNC\tF\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\tBen Morisson\t1998\tben.morisson@example6.org\t1\tAmplifier\tFL\tM\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\tArthur Duff\t1997\tarthur.duff@example4.com\t1\tNight table\tOR\tM\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "Project 1917953863988 has been successfully deleted\n" ] } ], "source": [ "p = cli.create('data/cli/duplicates.xml', trimStrings=True)\n", "cli.info(p.project_id)\n", "cli.export(p.project_id)\n", "cli.delete(p.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### recordPath" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id: 2293178566671\n", "rows: 10\n", " id: 2293178566671\n", " url: http://127.0.0.1:3333/project?project=2293178566671\n", " name: duplicates\n", " modified: 
2019-08-20T02:12:54Z\n", " created: 2019-08-20T02:12:54Z\n", " rowCount: 10\n", "importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.xml', u'storeBlankRows': True, u'encoding': u'', u'recordPath': [u'root', u'record', u'purchase'], u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n", " column 001: purchase\n", "purchase\n", "TV (UTF-8: ๐Ÿ“บ)\n", "\n", "Winter jacket\n", "Flashlight\n", "Dining table\n", "Bike\n", "Power drill\n", "'iPad'\n", "Amplifier\n", "Night table\n", "Project 2293178566671 has been successfully deleted\n" ] } ], "source": [ "p = cli.create('data/cli/duplicates.xml', recordPath=['root', 'record', 'purchase'])\n", "cli.info(p.project_id)\n", "cli.export(p.project_id)\n", "cli.delete(p.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### storeEmptyStrings\n", "\n", "default: True; set to False for null values\n", "\n", "check OpenRefine GUI at url below:\n", "* All > View > Show/Hide 'null' values in cells\n", "* row 6 should contain null values in columns state and gender" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id: 2438123269695\n", "rows: 10\n", " id: 2438123269695\n", " url: http://127.0.0.1:3333/project?project=2438123269695\n", " name: duplicates\n", " modified: 2019-08-20T02:12:54Z\n", " created: 2019-08-20T02:12:54Z\n", " rowCount: 10\n", "importOptionMetadata: [{u'storeEmptyStrings': False, u'fileSource': u'data/cli/duplicates.csv', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n", " column 001: email\n", " 
column 002: name\n", " column 003: state\n", " column 004: gender\n", " column 005: purchase\n", " column 006: count\n", " column 007: date\n" ] } ], "source": [ "p = cli.create('data/cli/duplicates.csv', storeEmptyStrings=False)\n", "cli.info(p.project_id)" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Project 2438123269695 has been successfully deleted\n" ] } ], "source": [ "cli.delete(p.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## TXT" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### default (line-based)" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id: 1913292396645\n", "rows: 11\n", " id: 1913292396645\n", " url: http://127.0.0.1:3333/project?project=1913292396645\n", " name: duplicates\n", " modified: 2019-08-20T02:12:55Z\n", " created: 2019-08-20T02:12:54Z\n", " rowCount: 11\n", "importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.txt', u'storeBlankRows': True, u'encoding': u'', u'ignoreLines': -1, u'projectName': u'duplicates', u'processQuotes': True, u'skipDataLines': -1, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False, u'headerLines': 0}]\n", " column 001: Column 1\n", "Column 1\n", "email name state gender purchase count date \n", "danny.baron@example1.com Danny Baron CA M TV (UTF-8: ๐Ÿ“บ) 1 Wed, 4 Jul 2001 \n", "melanie.white@example2.edu Melanie White NC F 1 2001-07-04T12:08:5\n", "\"danny.baron@example1.com D.\t(\"\"Tab\"\") Baron CA M Winter jacket 1 2001-07-04 \"\n", "ben.tyler@example3.org Ben Tyler NV M Flashlight 1 2001/07/04 \n", "arthur.duff@example4.com Arthur Duff OR M Dining table 1 2001-07 \n", "danny.baron@example1.com Daniel Baron Bike 1 2001 \n", 
"jean.griffith@example5.org Jean Griffith WA F Power drill 1 2000 \n", "melanie.white@example2.edu Melanie White NC F 'iPad' 1 1999 \n", "ben.morisson@example6.org Ben Morisson FL M Amplifier 1 1998 \n", "arthur.duff@example4.com Arthur Duff OR M Night table 1 1997 \n", "Project 1913292396645 has been successfully deleted\n" ] } ], "source": [ "p = cli.create('data/cli/duplicates.txt')\n", "cli.info(p.project_id)\n", "cli.export(p.project_id)\n", "cli.delete(p.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### linesPerRow\n", "\n", "should return 6 rows in 2 columns" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id: 1958513543951\n", "rows: 6\n", " id: 1958513543951\n", " url: http://127.0.0.1:3333/project?project=1958513543951\n", " name: duplicates\n", " modified: 2019-08-20T02:12:55Z\n", " created: 2019-08-20T02:12:55Z\n", " rowCount: 6\n", "importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.txt', u'storeBlankRows': True, u'encoding': u'', u'ignoreLines': -1, u'projectName': u'duplicates', u'processQuotes': True, u'limit': -1, u'skipDataLines': -1, u'separator': u',', u'trimStrings': False, u'linesPerRow': 2, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False, u'headerLines': 0}]\n", " column 001: Column 1\n", " column 002: Column 2\n", "Column 1\tColumn 2\n", "email name state gender purchase count date \tdanny.baron@example1.com Danny Baron CA M TV (UTF-8: ๐Ÿ“บ) 1 Wed, 4 Jul 2001 \n", "melanie.white@example2.edu Melanie White NC F 1 2001-07-04T12:08:5\t\"danny.baron@example1.com D.\t(\"\"Tab\"\") Baron CA M Winter jacket 1 2001-07-04 \"\n", "ben.tyler@example3.org Ben Tyler NV M Flashlight 1 2001/07/04 \tarthur.duff@example4.com Arthur Duff OR M Dining table 1 2001-07 \n", "danny.baron@example1.com Daniel Baron Bike 1 2001 \tjean.griffith@example5.org Jean Griffith 
WA F Power drill 1 2000 \n", "melanie.white@example2.edu Melanie White NC F 'iPad' 1 1999 \tben.morisson@example6.org Ben Morisson FL M Amplifier 1 1998 \n", "arthur.duff@example4.com Arthur Duff OR M Night table 1 1997 \t\n", "Project 1958513543951 has been successfully deleted\n" ] } ], "source": [ "p = cli.create('data/cli/duplicates.txt', linesPerRow=2)\n", "cli.info(p.project_id)\n", "cli.export(p.project_id)\n", "cli.delete(p.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### fixed-width: columnWidths and headerLines" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id: 1703842312470\n", "rows: 10\n", " id: 1703842312470\n", " url: http://127.0.0.1:3333/project?project=1703842312470\n", " name: duplicates\n", " modified: 2019-08-20T02:12:55Z\n", " created: 2019-08-20T02:12:55Z\n", " rowCount: 10\n", "importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.txt', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'duplicates', u'processQuotes': True, u'limit': -1, u'separator': u',', u'trimStrings': False, u'columnWidths': [27, 21, 6, 7, 15, 6, 1000], u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False, u'headerLines': 1}]\n", " column 001: email\n", " column 002: name\n", " column 003: state\n", " column 004: gender\n", " column 005: purchase\n", " column 006: count\n", " column 007: date\n", "email\tname\tstate\tgender\tpurchase\tcount\tdate\n", "danny.baron@example1.com \tDanny Baron \tCA \tM \tTV (UTF-8: ๐Ÿ“บ) \t1 \tWed, 4 Jul 2001 \n", "melanie.white@example2.edu \tMelanie White \tNC \tF \t \t1 \t2001-07-04T12:08:5\n", "danny.baron@example1.com \t\" D.\t(\"\"Tab\"\") Baron \"\tCA \tM \tWinter jacket \t1 \t2001-07-04 \n", "ben.tyler@example3.org \tBen Tyler \tNV \tM \tFlashlight \t1 \t2001/07/04 \n", "arthur.duff@example4.com \tArthur Duff \tOR \tM \tDining table 
\t1 \t2001-07 \n", "danny.baron@example1.com \tDaniel Baron \t \t \tBike \t1 \t2001 \n", "jean.griffith@example5.org \tJean Griffith \tWA \tF \tPower drill \t1 \t2000 \n", "melanie.white@example2.edu \tMelanie White \tNC \tF \t'iPad' \t1 \t1999 \n", "ben.morisson@example6.org \tBen Morisson \tFL \tM \tAmplifier \t1 \t1998 \n", "arthur.duff@example4.com \tArthur Duff \tOR \tM \tNight table \t1 \t1997 \n", "Project 1703842312470 has been successfully deleted\n" ] } ], "source": [ "p = cli.create('data/cli/duplicates.txt', columnWidths=[27, 21, 6, 7, 15, 6, 1000], headerLines=1)\n", "cli.info(p.project_id)\n", "cli.export(p.project_id)\n", "cli.delete(p.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## ZIP" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### default\n", "\n", "should contain 16 rows" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id: 2381217278039\n", "rows: 16\n", " id: 2381217278039\n", " url: http://127.0.0.1:3333/project?project=2381217278039\n", " name: duplicates\n", " modified: 2019-08-20T02:12:55Z\n", " created: 2019-08-20T02:12:55Z\n", " rowCount: 16\n", "importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'duplicates2.csv', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}, {u'storeEmptyStrings': True, u'fileSource': u'duplicates2.csv', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n", " column 001: email\n", " column 002: name\n", " column 003: state\n", " column 004: gender\n", " column 005: 
purchase\n", " column 006: count\n", " column 007: date\n", "email\tname\tstate\tgender\tpurchase\tcount\tdate\n", "danny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: ๐Ÿ“บ)\t1\tWed, 4 Jul 2001\n", "melanie.white@example2.edu\tMelanie White\tNC\tF\t\t1\t2001-07-04T12:08:56\n", "danny.baron@example1.com\t\" D.\t(\"\"Tab\"\") Baron\"\tCA\tM\tWinter jacket\t1\t2001-07-04\n", "ben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1\t2001/07/04\n", "arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table\t1\t2001-07\n", "danny.baron@example1.com\tDaniel Baron\t\t\tBike\t1\t2001\n", "jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1\t2000\n", "melanie.white@example2.edu\tMelanie White\tNC\tF\t'iPad'\t1\t1999\n", "ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1\t1998\n", "arthur.duff@example4.com\tArthur Duff\tOR\tM\tNight table\t1\t1997\n", "danny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: ๐Ÿ“บ), Winter jacket, bike\t3\tWed, 4 Jul 2001, 2001-07-04, 2001\n", "melanie.white@example2.edu\tMelanie White\tNC\tF\t, 'iPad'\t2\t2001-07-04T12:08:56, 1999\n", "ben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1\t2001/07/04\n", "arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table, Night table\t2\t2001-07, 1997\n", "jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1\t2000\n", "ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1\t1998\n", "Project 2381217278039 has been successfully deleted\n" ] } ], "source": [ "p = cli.create('data/cli/duplicates.zip')\n", "cli.info(p.project_id)\n", "cli.export(p.project_id)\n", "cli.delete(p.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### includeFileSources\n", "\n", "should contain column File" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id: 2314884555837\n", "rows: 16\n", " id: 2314884555837\n", " url: 
http://127.0.0.1:3333/project?project=2314884555837\n", " name: duplicates\n", " modified: 2019-08-20T02:12:55Z\n", " created: 2019-08-20T02:12:55Z\n", " rowCount: 16\n", "importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'duplicates2.csv', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': True}, {u'storeEmptyStrings': True, u'fileSource': u'duplicates2.csv', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': True}]\n", " column 001: File\n", " column 002: email\n", " column 003: name\n", " column 004: state\n", " column 005: gender\n", " column 006: purchase\n", " column 007: count\n", " column 008: date\n", "File\temail\tname\tstate\tgender\tpurchase\tcount\tdate\n", "duplicates.csv\tdanny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: ๐Ÿ“บ)\t1\tWed, 4 Jul 2001\n", "duplicates.csv\tmelanie.white@example2.edu\tMelanie White\tNC\tF\t\t1\t2001-07-04T12:08:56\n", "duplicates.csv\tdanny.baron@example1.com\t\" D.\t(\"\"Tab\"\") Baron\"\tCA\tM\tWinter jacket\t1\t2001-07-04\n", "duplicates.csv\tben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1\t2001/07/04\n", "duplicates.csv\tarthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table\t1\t2001-07\n", "duplicates.csv\tdanny.baron@example1.com\tDaniel Baron\t\t\tBike\t1\t2001\n", "duplicates.csv\tjean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1\t2000\n", "duplicates.csv\tmelanie.white@example2.edu\tMelanie White\tNC\tF\t'iPad'\t1\t1999\n", "duplicates.csv\tben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1\t1998\n", "duplicates.csv\tarthur.duff@example4.com\tArthur Duff\tOR\tM\tNight 
table\t1\t1997\n", "duplicates2.csv\tdanny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: ๐Ÿ“บ), Winter jacket, bike\t3\tWed, 4 Jul 2001, 2001-07-04, 2001\n", "duplicates2.csv\tmelanie.white@example2.edu\tMelanie White\tNC\tF\t, 'iPad'\t2\t2001-07-04T12:08:56, 1999\n", "duplicates2.csv\tben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1\t2001/07/04\n", "duplicates2.csv\tarthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table, Night table\t2\t2001-07, 1997\n", "duplicates2.csv\tjean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1\t2000\n", "duplicates2.csv\tben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1\t1998\n", "Project 2314884555837 has been successfully deleted\n" ] } ], "source": [ "p = cli.create('data/cli/duplicates.zip', includeFileSources=True)\n", "cli.info(p.project_id)\n", "cli.export(p.project_id)\n", "cli.delete(p.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## ODS (broken in OpenRefine >=2.8)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### default\n", "\n", "many blank columns and rows in OpenRefine <=2.7 (also with manual import via GUI)" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id: 1620818141127\n", "rows: 11\n", " id: 1620818141127\n", " url: http://127.0.0.1:3333/project?project=1620818141127\n", " name: duplicates\n", " modified: 2019-08-20T02:13:41Z\n", " created: 2019-08-20T02:13:41Z\n", " column 001: email\n", " column 002: name\n", " column 003: state\n", " column 004: gender\n", " column 005: purchase\n", " column 006: count\n", " column 007: date\n", " column 008: Column\n", " column 009: Column 9\n", " column 010: Column 10\n", " column 011: Column 11\n", " column 012: Column 12\n", " column 013: Column 13\n", " column 014: Column 14\n", " column 015: Column 15\n", " column 016: Column 16\n", " column 017: Column 17\n", " column 018: Column 18\n", " column 019: 
Column 19\n", "email\tname\tstate\tgender\tpurchase\tcount\tdate\tColumn\tColumn 9\tColumn 10\tColumn 11\tColumn 12\tColumn 13\tColumn 14\tColumn 15\tColumn 16\tColumn 17\tColumn 18\tColumn 19\n", "danny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: ๐Ÿ“บ)\t1.0\tWed, 4 Jul 2001\t\t\t\t\t\t\t\t\t\t\t\t\n", "melanie.white@example2.edu\tMelanie White\tNC\tF\t\t1.0\t2001-07-04T12:08:56\t\t\t\t\t\t\t\t\t\t\t\t\n", "danny.baron@example1.com\t\" D.(\"\"Tab\"\") Baron\"\tCA\tM\tWinter jacket\t1.0\t2001-07-04\t\t\t\t\t\t\t\t\t\t\t\t\n", "ben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1.0\t2001/07/04\t\t\t\t\t\t\t\t\t\t\t\t\n", "arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table\t1.0\t2001-07\t\t\t\t\t\t\t\t\t\t\t\t\n", "danny.baron@example1.com\tDaniel Baron\t\t\tBike\t1.0\t2001.0\t\t\t\t\t\t\t\t\t\t\t\t\n", "jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1.0\t2000.0\t\t\t\t\t\t\t\t\t\t\t\t\n", "melanie.white@example2.edu\tMelanie White\tNC\tF\t'iPad'\t1.0\t1999.0\t\t\t\t\t\t\t\t\t\t\t\t\n", "ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1.0\t1998.0\t\t\t\t\t\t\t\t\t\t\t\t\n", "arthur.duff@example4.com\tArthur Duff\tOR\tM\tNight table\t1.0\t1997.0\t\t\t\t\t\t\t\t\t\t\t\t\n", "Project 1620818141127 has been successfully deleted\n" ] } ], "source": [ "p = cli.create('data/cli/duplicates.ods')\n", "cli.info(p.project_id)\n", "cli.export(p.project_id)\n", "cli.delete(p.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### sheets" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "first sheet from file with 2 sheets" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id: 1985853059017\n", "rows: 11\n", " id: 1985853059017\n", " url: http://127.0.0.1:3333/project?project=1985853059017\n", " name: duplicates2\n", " modified: 2019-08-20T02:13:47Z\n", " created: 2019-08-20T02:13:47Z\n", " column 001: email\n", " column 
002: name\n", " column 003: state\n", " column 004: gender\n", " column 005: purchase\n", " column 006: count\n", " column 007: date\n", " column 008: Column\n", " column 009: Column 9\n", " column 010: Column 10\n", " column 011: Column 11\n", " column 012: Column 12\n", " column 013: Column 13\n", " column 014: Column 14\n", " column 015: Column 15\n", " column 016: Column 16\n", " column 017: Column 17\n", " column 018: Column 18\n", " column 019: Column 19\n", "email\tname\tstate\tgender\tpurchase\tcount\tdate\tColumn\tColumn 9\tColumn 10\tColumn 11\tColumn 12\tColumn 13\tColumn 14\tColumn 15\tColumn 16\tColumn 17\tColumn 18\tColumn 19\n", "danny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: ๐Ÿ“บ)\t1.0\tWed, 4 Jul 2001\t\t\t\t\t\t\t\t\t\t\t\t\n", "melanie.white@example2.edu\tMelanie White\tNC\tF\t\t1.0\t2001-07-04T12:08:56\t\t\t\t\t\t\t\t\t\t\t\t\n", "danny.baron@example1.com\t\" D.(\"\"Tab\"\") Baron\"\tCA\tM\tWinter jacket\t1.0\t2001-07-04\t\t\t\t\t\t\t\t\t\t\t\t\n", "ben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1.0\t2001/07/04\t\t\t\t\t\t\t\t\t\t\t\t\n", "arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table\t1.0\t2001-07\t\t\t\t\t\t\t\t\t\t\t\t\n", "danny.baron@example1.com\tDaniel Baron\t\t\tBike\t1.0\t2001.0\t\t\t\t\t\t\t\t\t\t\t\t\n", "jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1.0\t2000.0\t\t\t\t\t\t\t\t\t\t\t\t\n", "melanie.white@example2.edu\tMelanie White\tNC\tF\t'iPad'\t1.0\t1999.0\t\t\t\t\t\t\t\t\t\t\t\t\n", "ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1.0\t1998.0\t\t\t\t\t\t\t\t\t\t\t\t\n", "arthur.duff@example4.com\tArthur Duff\tOR\tM\tNight table\t1.0\t1997.0\t\t\t\t\t\t\t\t\t\t\t\t\n", "Project 1985853059017 has been successfully deleted\n" ] } ], "source": [ "p = cli.create('data/cli/duplicates2.ods', sheets=[0])\n", "cli.info(p.project_id)\n", "cli.export(p.project_id)\n", "cli.delete(p.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "both sheets from file with 2 
sheets: should contain 16 rows" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id: 2325827930833\n", "rows: 18\n", " id: 2325827930833\n", " url: http://127.0.0.1:3333/project?project=2325827930833\n", " name: duplicates2\n", " modified: 2019-08-20T02:13:49Z\n", " created: 2019-08-20T02:13:49Z\n", " column 001: email\n", " column 002: name\n", " column 003: state\n", " column 004: gender\n", " column 005: purchase\n", " column 006: count\n", " column 007: date\n", " column 008: Column\n", " column 009: Column 9\n", " column 010: Column 10\n", " column 011: Column 11\n", " column 012: Column 12\n", " column 013: Column 13\n", " column 014: Column 14\n", " column 015: Column 15\n", " column 016: Column 16\n", " column 017: Column 17\n", " column 018: Column 18\n", " column 019: Column 19\n", " column 020: Column 20\n", " column 021: Column 21\n", " column 022: Column 22\n", " column 023: Column 23\n", " column 024: Column 24\n", " column 025: Column 25\n", " column 026: Column 26\n", "email\tname\tstate\tgender\tpurchase\tcount\tdate\tColumn\tColumn 9\tColumn 10\tColumn 11\tColumn 12\tColumn 13\tColumn 14\tColumn 15\tColumn 16\tColumn 17\tColumn 18\tColumn 19\tColumn 20\tColumn 21\tColumn 22\tColumn 23\tColumn 24\tColumn 25\tColumn 26\n", "danny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: ๐Ÿ“บ)\t1.0\tWed, 4 Jul 2001\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n", "melanie.white@example2.edu\tMelanie White\tNC\tF\t\t1.0\t2001-07-04T12:08:56\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n", "danny.baron@example1.com\t\" D.(\"\"Tab\"\") Baron\"\tCA\tM\tWinter jacket\t1.0\t2001-07-04\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n", "ben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1.0\t2001/07/04\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n", "arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table\t1.0\t2001-07\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n", 
"danny.baron@example1.com\tDaniel Baron\t\t\tBike\t1.0\t2001.0\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n", "jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1.0\t2000.0\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n", "melanie.white@example2.edu\tMelanie White\tNC\tF\t'iPad'\t1.0\t1999.0\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n", "ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1.0\t1998.0\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n", "arthur.duff@example4.com\tArthur Duff\tOR\tM\tNight table\t1.0\t1997.0\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n", "danny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: ๐Ÿ“บ), Winter jacket, bike\t3.0\tWed, 4 Jul 2001, 2001-07-04, 2001\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n", "melanie.white@example2.edu\tMelanie White\tNC\tF\t, 'iPad'\t2.0\t2001-07-04T12:08:56, 1999\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n", "ben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1.0\t2001/07/04\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n", "arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table, Night table\t2.0\t2001-07, 1997\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n", "jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1.0\t2000.0\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n", "ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1.0\t1998.0\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n", "Project 2325827930833 has been successfully deleted\n" ] } ], "source": [ "p = cli.create('data/cli/duplicates2.ods', sheets=[0, 1])\n", "cli.info(p.project_id)\n", "cli.export(p.project_id)\n", "cli.delete(p.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## XLS (broken in OpenRefine >=2.8)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### default" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id: 1607123650693\n", "rows: 10\n", " id: 1607123650693\n", " url: http://127.0.0.1:3333/project?project=1607123650693\n", 
" name: duplicates\n", " modified: 2019-08-20T02:13:52Z\n", " created: 2019-08-20T02:13:52Z\n", " column 001: email\n", " column 002: name\n", " column 003: state\n", " column 004: gender\n", " column 005: purchase\n", " column 006: count\n", " column 007: date\n", "email\tname\tstate\tgender\tpurchase\tcount\tdate\n", "danny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: ๐Ÿ“บ)\t1.0\tWed, 4 Jul 2001\n", "melanie.white@example2.edu\tMelanie White\tNC\tF\t\t1.0\t2001-07-04T12:08:56\n", "danny.baron@example1.com\t\" D. (\"\"Tab\"\") Baron\"\tCA\tM\tWinter jacket\t1.0\t2001-07-04\n", "ben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1.0\t2001/07/04\n", "arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table\t1.0\t2001-07\n", "danny.baron@example1.com\tDaniel Baron\t\t\tBike\t1.0\t2001.0\n", "jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1.0\t2000.0\n", "melanie.white@example2.edu\tMelanie White\tNC\tF\t'iPad'\t1.0\t1999.0\n", "ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1.0\t1998.0\n", "arthur.duff@example4.com\tArthur Duff\tOR\tM\tNight table\t1.0\t1997.0\n", "Project 1607123650693 has been successfully deleted\n" ] } ], "source": [ "p = cli.create('data/cli/duplicates.xls')\n", "cli.info(p.project_id)\n", "cli.export(p.project_id)\n", "cli.delete(p.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### sheets" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "first sheet from file with 2 sheets" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id: 2439816728218\n", "rows: 10\n", " id: 2439816728218\n", " url: http://127.0.0.1:3333/project?project=2439816728218\n", " name: duplicates2\n", " modified: 2019-08-20T02:13:58Z\n", " created: 2019-08-20T02:13:58Z\n", " column 001: email\n", " column 002: name\n", " column 003: state\n", " column 004: gender\n", " column 005: purchase\n", " column 006: count\n", 
" column 007: date\n", "email\tname\tstate\tgender\tpurchase\tcount\tdate\n", "danny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: ๐Ÿ“บ)\t1.0\tWed, 4 Jul 2001\n", "melanie.white@example2.edu\tMelanie White\tNC\tF\t\t1.0\t2001-07-04T12:08:56\n", "danny.baron@example1.com\t\" D.(\"\"Tab\"\") Baron\"\tCA\tM\tWinter jacket\t1.0\t2001-07-04\n", "ben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1.0\t2001/07/04\n", "arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table\t1.0\t2001-07\n", "danny.baron@example1.com\tDaniel Baron\t\t\tBike\t1.0\t2001.0\n", "jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1.0\t2000.0\n", "melanie.white@example2.edu\tMelanie White\tNC\tF\t'iPad'\t1.0\t1999.0\n", "ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1.0\t1998.0\n", "arthur.duff@example4.com\tArthur Duff\tOR\tM\tNight table\t1.0\t1997.0\n", "Project 2439816728218 has been successfully deleted\n" ] } ], "source": [ "p = cli.create('data/cli/duplicates2.xls', sheets=[0])\n", "cli.info(p.project_id)\n", "cli.export(p.project_id)\n", "cli.delete(p.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "both sheets from file with 2 sheets: should contain 16 rows" ] }, { "cell_type": "code", "execution_count": 41, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id: 1954256360738\n", "rows: 16\n", " id: 1954256360738\n", " url: http://127.0.0.1:3333/project?project=1954256360738\n", " name: duplicates2\n", " modified: 2019-08-20T02:13:59Z\n", " created: 2019-08-20T02:13:59Z\n", " column 001: email\n", " column 002: name\n", " column 003: state\n", " column 004: gender\n", " column 005: purchase\n", " column 006: count\n", " column 007: date\n", "email\tname\tstate\tgender\tpurchase\tcount\tdate\n", "danny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: ๐Ÿ“บ)\t1.0\tWed, 4 Jul 2001\n", "melanie.white@example2.edu\tMelanie White\tNC\tF\t\t1.0\t2001-07-04T12:08:56\n", 
"danny.baron@example1.com\t\" D.(\"\"Tab\"\") Baron\"\tCA\tM\tWinter jacket\t1.0\t2001-07-04\n", "ben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1.0\t2001/07/04\n", "arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table\t1.0\t2001-07\n", "danny.baron@example1.com\tDaniel Baron\t\t\tBike\t1.0\t2001.0\n", "jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1.0\t2000.0\n", "melanie.white@example2.edu\tMelanie White\tNC\tF\t'iPad'\t1.0\t1999.0\n", "ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1.0\t1998.0\n", "arthur.duff@example4.com\tArthur Duff\tOR\tM\tNight table\t1.0\t1997.0\n", "danny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: ๐Ÿ“บ), Winter jacket, bike\t3.0\tWed, 4 Jul 2001, 2001-07-04, 2001\n", "melanie.white@example2.edu\tMelanie White\tNC\tF\t, 'iPad'\t2.0\t2001-07-04T12:08:56, 1999\n", "ben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1.0\t2001/07/04\n", "arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table, Night table\t2.0\t2001-07, 1997\n", "jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1.0\t2000.0\n", "ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1.0\t1998.0\n", "Project 1954256360738 has been successfully deleted\n" ] } ], "source": [ "p = cli.create('data/cli/duplicates2.xls', sheets=[0, 1])\n", "cli.info(p.project_id)\n", "cli.export(p.project_id)\n", "cli.delete(p.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## XLSX (broken in OpenRefine >=2.8)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### default" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id: 2423289296267\n", "rows: 10\n", " id: 2423289296267\n", " url: http://127.0.0.1:3333/project?project=2423289296267\n", " name: duplicates\n", " modified: 2019-08-20T02:14:01Z\n", " created: 2019-08-20T02:14:01Z\n", " column 001: email\n", " column 002: name\n", " column 003: state\n", 
" column 004: gender\n", " column 005: purchase\n", " column 006: count\n", " column 007: date\n", "email\tname\tstate\tgender\tpurchase\tcount\tdate\n", "danny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: ๐Ÿ“บ)\t1.0\tWed, 4 Jul 2001\n", "melanie.white@example2.edu\tMelanie White\tNC\tF\t\t1.0\t2001-07-04T12:08:56\n", "danny.baron@example1.com\t\" D. (\"\"Tab\"\") Baron\"\tCA\tM\tWinter jacket\t1.0\t2001-07-04\n", "ben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1.0\t2001/07/04\n", "arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table\t1.0\t2001-07\n", "danny.baron@example1.com\tDaniel Baron\t\t\tBike\t1.0\t2001.0\n", "jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1.0\t2000.0\n", "melanie.white@example2.edu\tMelanie White\tNC\tF\t'iPad'\t1.0\t1999.0\n", "ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1.0\t1998.0\n", "arthur.duff@example4.com\tArthur Duff\tOR\tM\tNight table\t1.0\t1997.0\n", "Project 2423289296267 has been successfully deleted\n" ] } ], "source": [ "p = cli.create('data/cli/duplicates.xlsx')\n", "cli.info(p.project_id)\n", "cli.export(p.project_id)\n", "cli.delete(p.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### sheets" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "first sheet from file with 2 sheets" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id: 1593486586431\n", "rows: 10\n", " id: 1593486586431\n", " url: http://127.0.0.1:3333/project?project=1593486586431\n", " name: duplicates2\n", " modified: 2019-08-20T02:14:04Z\n", " created: 2019-08-20T02:14:04Z\n", " column 001: email\n", " column 002: name\n", " column 003: state\n", " column 004: gender\n", " column 005: purchase\n", " column 006: count\n", " column 007: date\n", "email\tname\tstate\tgender\tpurchase\tcount\tdate\n", "danny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: ๐Ÿ“บ)\t1.0\tWed, 4 Jul 
2001\n", "melanie.white@example2.edu\tMelanie White\tNC\tF\t\t1.0\t2001-07-04T12:08:56\n", "danny.baron@example1.com\t\" D.(\"\"Tab\"\") Baron\"\tCA\tM\tWinter jacket\t1.0\t2001-07-04\n", "ben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1.0\t2001/07/04\n", "arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table\t1.0\t2001-07\n", "danny.baron@example1.com\tDaniel Baron\t\t\tBike\t1.0\t2001.0\n", "jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1.0\t2000.0\n", "melanie.white@example2.edu\tMelanie White\tNC\tF\t'iPad'\t1.0\t1999.0\n", "ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1.0\t1998.0\n", "arthur.duff@example4.com\tArthur Duff\tOR\tM\tNight table\t1.0\t1997.0\n", "Project 1593486586431 has been successfully deleted\n" ] } ], "source": [ "p = cli.create('data/cli/duplicates2.xlsx', sheets=[0])\n", "cli.info(p.project_id)\n", "cli.export(p.project_id)\n", "cli.delete(p.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "both sheets from file with 2 sheets: should contain 16 rows" ] }, { "cell_type": "code", "execution_count": 44, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id: 1857964669991\n", "rows: 16\n", " id: 1857964669991\n", " url: http://127.0.0.1:3333/project?project=1857964669991\n", " name: duplicates2\n", " modified: 2019-08-20T02:14:09Z\n", " created: 2019-08-20T02:14:09Z\n", " column 001: email\n", " column 002: name\n", " column 003: state\n", " column 004: gender\n", " column 005: purchase\n", " column 006: count\n", " column 007: date\n", "email\tname\tstate\tgender\tpurchase\tcount\tdate\n", "danny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: 📺)\t1.0\tWed, 4 Jul 2001\n", "melanie.white@example2.edu\tMelanie White\tNC\tF\t\t1.0\t2001-07-04T12:08:56\n", "danny.baron@example1.com\t\" D.(\"\"Tab\"\") Baron\"\tCA\tM\tWinter jacket\t1.0\t2001-07-04\n", "ben.tyler@example3.org\tBen 
Tyler\tNV\tM\tFlashlight\t1.0\t2001/07/04\n", "arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table\t1.0\t2001-07\n", "danny.baron@example1.com\tDaniel Baron\t\t\tBike\t1.0\t2001.0\n", "jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1.0\t2000.0\n", "melanie.white@example2.edu\tMelanie White\tNC\tF\t'iPad'\t1.0\t1999.0\n", "ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1.0\t1998.0\n", "arthur.duff@example4.com\tArthur Duff\tOR\tM\tNight table\t1.0\t1997.0\n", "danny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: 📺), Winter jacket, bike\t3.0\tWed, 4 Jul 2001, 2001-07-04, 2001\n", "melanie.white@example2.edu\tMelanie White\tNC\tF\t, 'iPad'\t2.0\t2001-07-04T12:08:56, 1999\n", "ben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1.0\t2001/07/04\n", "arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table, Night table\t2.0\t2001-07, 1997\n", "jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1.0\t2000.0\n", "ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1.0\t1998.0\n", "Project 1857964669991 has been successfully deleted\n" ] } ], "source": [ "p = cli.create('data/cli/duplicates2.xlsx', sheets=[0, 1])\n", "cli.info(p.project_id)\n", "cli.export(p.project_id)\n", "cli.delete(p.project_id)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.16" } }, "nbformat": 4, "nbformat_minor": 2 }