diff --git a/google/refine/__main__.py b/google/refine/__main__.py index 1c64362..f1b21a2 100644 --- a/google/refine/__main__.py +++ b/google/refine/__main__.py @@ -145,7 +145,7 @@ group5.add_option('--projectTags', dest='projectTags', help='(all formats), please provide tags in multiple arguments, e.g. --projectTags=beta --projectTags=client1') group5.add_option('--recordPath', dest='recordPath', action='append', - help='(xml,json), please provide path in multiple arguments without slashes, e.g. /collection/record/ should be entered like this: --recordPath=collection --recordPath=record, default xml: record, default json: _ _') + help='(xml,json), please provide path in multiple arguments, e.g. /collection/record/ should be entered: --recordPath=collection --recordPath=record, default xml: root element, default json: _ _') group5.add_option('--separator', dest='separator', help='(csv,tsv), default csv: , default tsv: \\t') group5.add_option('--sheets', dest='sheets', diff --git a/google/refine/cli.py b/google/refine/cli.py index 106a3d2..4a48ea7 100644 --- a/google/refine/cli.py +++ b/google/refine/cli.py @@ -25,6 +25,7 @@ import ssl import sys import time import urllib +from xml.etree import ElementTree from google.refine import refine @@ -43,7 +44,6 @@ def apply(project_id, history_file): def create(project_file, project_format=None, - project_name=None, columnWidths=None, encoding=None, guessCellValueTypes=False, @@ -54,6 +54,7 @@ def create(project_file, linesPerRow=None, processQuotes=True, projectName=None, + projectTags=None, recordPath=None, separator=None, sheets=None, @@ -69,15 +70,15 @@ def create(project_file, project_format = os.path.splitext(project_file)[1][1:].lower() if project_format == 'txt': try: - columnWidths + columnWidths[0] project_format = 'fixed-width' - except NameError: + except TypeError: project_format = 'line-based' # defaults for each file type if project_format == 'xml': project_format = 'text/xml' if not recordPath: - recordPath 
= 'record' + recordPath = [ElementTree.parse(project_file).getroot().tag] elif project_format == 'csv': project_format = 'text/line-based/*sv' elif project_format == 'tsv': @@ -95,22 +96,35 @@ def create(project_file, elif project_format == 'json': project_format = 'text/json' if not recordPath: - recordPath = ('_', '_') + recordPath = ['_', '_'] elif project_format == 'xls': project_format = 'binary/text/xml/xls/xlsx' if not sheets: - sheets = 0 + sheets = [0] + # TODO: new format for sheets option introduced in OpenRefine 2.8 elif project_format == 'xlsx': project_format = 'binary/text/xml/xls/xlsx' if not sheets: - sheets = 0 + sheets = [0] + # TODO: new format for sheets option introduced in OpenRefine 2.8 elif project_format == 'ods': project_format = 'text/xml/ods' if not sheets: - sheets = 0 + sheets = [0] + # TODO: new format for sheets option introduced in OpenRefine 2.8 # execute kwargs = {k: v for k, v in vars().items() if v is not None} - project = refine.Refine(refine.RefineServer()).new_project(**kwargs) + project = refine.Refine(refine.RefineServer()).new_project( + guess_cell_value_types=guessCellValueTypes, + ignore_lines=ignoreLines, + header_lines=headerLines, + skip_data_lines=skipDataLines, + store_blank_rows=storeBlankRows, + process_quotes=processQuotes, + project_name=projectName, + store_blank_cells_as_nulls=storeBlankCellsAsNulls, + include_file_sources=includeFileSources, + **kwargs) rows = project.do_json('get-rows')['total'] if rows > 0: print('{0}: {1}'.format('id', project.project_id)) diff --git a/google/refine/refine.py b/google/refine/refine.py index 33dcb11..b85e533 100644 --- a/google/refine/refine.py +++ b/google/refine/refine.py @@ -147,41 +147,127 @@ class Refine: """Open a Refine project.""" return RefineProject(self.server, project_id) - def new_project(self, project_file=None, project_name=None, - project_format='text/line-based/*sv', **kwargs): - """Create a Refine project.""" + # These aren't used yet but are included for 
reference + new_project_defaults = { + 'text/line-based/*sv': { + 'encoding': '', + 'separator': ',', + 'ignore_lines': -1, + 'header_lines': 1, + 'skip_data_lines': 0, + 'limit': -1, + 'store_blank_rows': True, + 'guess_cell_value_types': True, + 'process_quotes': True, + 'store_blank_cells_as_nulls': True, + 'include_file_sources': False}, + 'text/line-based': { + 'encoding': '', + 'lines_per_row': 1, + 'ignore_lines': -1, + 'limit': -1, + 'skip_data_lines': -1, + 'store_blank_rows': True, + 'store_blank_cells_as_nulls': True, + 'include_file_sources': False}, + 'text/line-based/fixed-width': { + 'encoding': '', + 'column_widths': [20], + 'ignore_lines': -1, + 'header_lines': 0, + 'skip_data_lines': 0, + 'limit': -1, + 'guess_cell_value_types': False, + 'store_blank_rows': True, + 'store_blank_cells_as_nulls': True, + 'include_file_sources': False}, + 'text/line-based/pc-axis': { + 'encoding': '', + 'limit': -1, + 'skip_data_lines': -1, + 'include_file_sources': False}, + 'text/rdf+n3': {'encoding': ''}, + 'text/xml/ods': { + 'sheets': [], + 'ignore_lines': -1, + 'header_lines': 1, + 'skip_data_lines': 0, + 'limit': -1, + 'store_blank_rows': True, + 'store_blank_cells_as_nulls': True, + 'include_file_sources': False}, + 'binary/xls': { + 'xml_based': False, + 'sheets': [], + 'ignore_lines': -1, + 'header_lines': 1, + 'skip_data_lines': 0, + 'limit': -1, + 'store_blank_rows': True, + 'store_blank_cells_as_nulls': True, + 'include_file_sources': False} + } - defaults = {'guessCellValueTypes': False, - 'headerLines': 1, - 'ignoreLines': -1, - 'includeFileSources': False, - 'limit': -1, - 'linesPerRow': 1, - 'processQuotes': True, - 'separator': ',', - 'skipDataLines': 0, - 'storeBlankCellsAsNulls': True, - 'storeBlankRows': True, - 'storeEmptyStrings': True, - 'trimStrings': False} + def new_project(self, project_file=None, project_url=None, project_name=None, project_format='text/line-based/*sv', + encoding='', + separator=',', + ignore_lines=-1, + header_lines=1, 
+ skip_data_lines=0, + limit=-1, + store_blank_rows=True, + guess_cell_value_types=False, + process_quotes=True, + store_blank_cells_as_nulls=True, + include_file_sources=False, + **opts): - # options - options = {'format': project_format} - if project_file is not None: - options['project-file'] = {'fd': open(project_file), - 'filename': project_file} + if (project_file and project_url) or (not project_file and not project_url): + raise ValueError('One (only) of project_file and project_url must be set') + + def s(opt): + if isinstance(opt, bool): + return 'true' if opt else 'false' + if opt is None: + return '' + return str(opt) + + # the new APIs requires a json in the 'option' POST or GET argument + # POST is broken at the moment, so we send it in the URL + new_style_options = dict(opts, **{ + 'encoding': s(encoding), + }) + params = { + 'options': json.dumps(new_style_options), + } + + # old style options + options = { + 'format': project_format, + 'separator': s(separator), + 'ignore-lines': s(ignore_lines), + 'header-lines': s(header_lines), + 'skip-data-lines': s(skip_data_lines), + 'limit': s(limit), + 'guess-value-type': s(guess_cell_value_types), + 'process-quotes': s(process_quotes), + 'store-blank-rows': s(store_blank_rows), + 'store-blank-cells-as-nulls': s(store_blank_cells_as_nulls), + 'include-file-sources': s(include_file_sources), + } + + if project_url is not None: + options['url'] = project_url + elif project_file is not None: + options['project-file'] = { + 'fd': open(project_file), + 'filename': project_file, + } if project_name is None: # make a name for itself by stripping extension and directories project_name = (project_file or 'New project').rsplit('.', 1)[0] project_name = os.path.basename(project_name) options['project-name'] = project_name - - # params - params_dict = dict(defaults) - params_dict.update(kwargs) - params = {'options': json.dumps(params_dict)} - - # submit response = self.server.urlopen( 'create-project-from-upload', 
options, params ) diff --git a/tests/cli_create.ipynb b/tests/cli_create.ipynb new file mode 100644 index 0000000..9f7116b --- /dev/null +++ b/tests/cli_create.ipynb @@ -0,0 +1,2383 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Test function create in module cli" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Install\n", + "\n", + "This notebook requires a Python 2.7 environment and an OpenRefine server running at http://127.0.0.1:3333." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mDEPRECATION: Python 2.7 will reach the end of its life on January 1st, 2020. Please upgrade your Python as Python 2.7 won't be maintained after that date. A future version of pip will drop support for Python 2.7. More details about Python 2 support in pip, can be found at https://pip.pypa.io/en/latest/development/release-process/#python-2-support\u001b[0m\n", + "Processing /home/felix/git/openrefine-client\n", + "Requirement already satisfied, skipping upgrade: urllib2_file in /home/felix/.local/lib/python2.7/site-packages (from openrefine-client==0.3.7) (0.2.1)\n", + "Installing collected packages: openrefine-client\n", + " Found existing installation: openrefine-client 0.3.7\n", + " Uninstalling openrefine-client-0.3.7:\n", + " Successfully uninstalled openrefine-client-0.3.7\n", + " Running setup.py install for openrefine-client ... \u001b[?25ldone\n", + "\u001b[?25hSuccessfully installed openrefine-client-0.3.7\n" + ] + } + ], + "source": [ + "import sys\n", + "!{sys.executable} -m pip install .. 
--user --upgrade" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from google.refine import cli" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## CSV" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### default" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id: 1618143866116\n", + "rows: 10\n", + " id: 1618143866116\n", + " url: http://127.0.0.1:3333/project?project=1618143866116\n", + " name: duplicates\n", + " modified: 2019-08-20T02:12:53Z\n", + " created: 2019-08-20T02:12:53Z\n", + " rowCount: 10\n", + "importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.csv', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n", + " column 001: email\n", + " column 002: name\n", + " column 003: state\n", + " column 004: gender\n", + " column 005: purchase\n", + " column 006: count\n", + " column 007: date\n", + "email\tname\tstate\tgender\tpurchase\tcount\tdate\n", + "danny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: ๐Ÿ“บ)\t1\tWed, 4 Jul 2001\n", + "melanie.white@example2.edu\tMelanie White\tNC\tF\t\t1\t2001-07-04T12:08:56\n", + "danny.baron@example1.com\t\" D.\t(\"\"Tab\"\") Baron\"\tCA\tM\tWinter jacket\t1\t2001-07-04\n", + "ben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1\t2001/07/04\n", + "arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table\t1\t2001-07\n", + "danny.baron@example1.com\tDaniel Baron\t\t\tBike\t1\t2001\n", + "jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1\t2000\n", + "melanie.white@example2.edu\tMelanie White\tNC\tF\t'iPad'\t1\t1999\n", 
+ "ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1\t1998\n", + "arthur.duff@example4.com\tArthur Duff\tOR\tM\tNight table\t1\t1997\n", + "Project 1618143866116 has been successfully deleted\n" + ] + } + ], + "source": [ + "p = cli.create('data/cli/duplicates.csv')\n", + "cli.info(p.project_id)\n", + "cli.export(p.project_id)\n", + "cli.delete(p.project_id)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### encoding\n", + "\n", + "check TV symbol in line 1" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id: 1676755759011\n", + "rows: 10\n", + "email\tname\tstate\tgender\tpurchase\tcount\tdate\n", + "danny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: รฐยŸย“ยบ)\t1\tWed, 4 Jul 2001\n", + "melanie.white@example2.edu\tMelanie White\tNC\tF\t\t1\t2001-07-04T12:08:56\n", + "danny.baron@example1.com\t\" D.\t(\"\"Tab\"\") Baron\"\tCA\tM\tWinter jacket\t1\t2001-07-04\n", + "ben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1\t2001/07/04\n", + "arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table\t1\t2001-07\n", + "danny.baron@example1.com\tDaniel Baron\t\t\tBike\t1\t2001\n", + "jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1\t2000\n", + "melanie.white@example2.edu\tMelanie White\tNC\tF\t'iPad'\t1\t1999\n", + "ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1\t1998\n", + "arthur.duff@example4.com\tArthur Duff\tOR\tM\tNight table\t1\t1997\n", + "Project 1676755759011 has been successfully deleted\n" + ] + } + ], + "source": [ + "p = cli.create('data/cli/duplicates.csv', encoding='ISO-8859-1')\n", + "cli.export(p.project_id)\n", + "cli.delete(p.project_id)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id: 1970849280401\n", + "rows: 10\n", + 
"email\tname\tstate\tgender\tpurchase\tcount\tdate\n", + "danny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: ๐Ÿ“บ)\t1\tWed, 4 Jul 2001\n", + "melanie.white@example2.edu\tMelanie White\tNC\tF\t\t1\t2001-07-04T12:08:56\n", + "danny.baron@example1.com\t\" D.\t(\"\"Tab\"\") Baron\"\tCA\tM\tWinter jacket\t1\t2001-07-04\n", + "ben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1\t2001/07/04\n", + "arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table\t1\t2001-07\n", + "danny.baron@example1.com\tDaniel Baron\t\t\tBike\t1\t2001\n", + "jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1\t2000\n", + "melanie.white@example2.edu\tMelanie White\tNC\tF\t'iPad'\t1\t1999\n", + "ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1\t1998\n", + "arthur.duff@example4.com\tArthur Duff\tOR\tM\tNight table\t1\t1997\n", + "Project 1970849280401 has been successfully deleted\n" + ] + } + ], + "source": [ + "p = cli.create('data/cli/duplicates.csv', encoding='UTF-8')\n", + "cli.export(p.project_id)\n", + "cli.delete(p.project_id)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### guessCellValueTypes\n", + "\n", + "check OpenRefine GUI at url below: numbers should be green" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id: 2231557582225\n", + "rows: 10\n", + " id: 2231557582225\n", + " url: http://127.0.0.1:3333/project?project=2231557582225\n", + " name: duplicates\n", + " modified: 2019-08-20T02:12:53Z\n", + " created: 2019-08-20T02:12:53Z\n", + " rowCount: 10\n", + "importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.csv', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': True, 
u'includeFileSources': False}]\n", + " column 001: email\n", + " column 002: name\n", + " column 003: state\n", + " column 004: gender\n", + " column 005: purchase\n", + " column 006: count\n", + " column 007: date\n" + ] + } + ], + "source": [ + "p = cli.create('data/cli/duplicates.csv', guessCellValueTypes=True)\n", + "cli.info(p.project_id)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Project 2231557582225 has been successfully deleted\n" + ] + } + ], + "source": [ + "cli.delete(p.project_id)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### headerLines\n", + "\n", + "check column names, should be Column 1..." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id: 2294888751269\n", + "rows: 11\n", + "Column 1\tColumn 2\tColumn 3\tColumn 4\tColumn 5\tColumn 6\tColumn 7\n", + "email\tname\tstate\tgender\tpurchase\tcount\tdate\n", + "danny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: ๐Ÿ“บ)\t1\tWed, 4 Jul 2001\n", + "melanie.white@example2.edu\tMelanie White\tNC\tF\t\t1\t2001-07-04T12:08:56\n", + "danny.baron@example1.com\t\" D.\t(\"\"Tab\"\") Baron\"\tCA\tM\tWinter jacket\t1\t2001-07-04\n", + "ben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1\t2001/07/04\n", + "arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table\t1\t2001-07\n", + "danny.baron@example1.com\tDaniel Baron\t\t\tBike\t1\t2001\n", + "jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1\t2000\n", + "melanie.white@example2.edu\tMelanie White\tNC\tF\t'iPad'\t1\t1999\n", + "ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1\t1998\n", + "arthur.duff@example4.com\tArthur Duff\tOR\tM\tNight table\t1\t1997\n", + "Project 2294888751269 has been successfully deleted\n" + ] + } + ], + "source": [ + "p = 
cli.create('data/cli/duplicates.csv', headerLines=0)\n", + "cli.export(p.project_id)\n", + "cli.delete(p.project_id)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### ignoreLines\n", + "\n", + "check column names, should start with arthur.duff as header" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id: 1990694976789\n", + "rows: 5\n", + "arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table\t1\t2001-07\n", + "danny.baron@example1.com\tDaniel Baron\t\t\tBike\t1\t2001\n", + "jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1\t2000\n", + "melanie.white@example2.edu\tMelanie White\tNC\tF\t'iPad'\t1\t1999\n", + "ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1\t1998\n", + "arthur.duff@example4.com\tArthur Duff\tOR\tM\tNight table\t1\t1997\n", + "Project 1990694976789 has been successfully deleted\n" + ] + } + ], + "source": [ + "p = cli.create('data/cli/duplicates.csv', ignoreLines=5)\n", + "cli.export(p.project_id)\n", + "cli.delete(p.project_id)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### limit\n", + "\n", + "should contain 5 rows" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id: 1834697810094\n", + "rows: 5\n", + "email\tname\tstate\tgender\tpurchase\tcount\tdate\n", + "danny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: ๐Ÿ“บ)\t1\tWed, 4 Jul 2001\n", + "melanie.white@example2.edu\tMelanie White\tNC\tF\t\t1\t2001-07-04T12:08:56\n", + "danny.baron@example1.com\t\" D.\t(\"\"Tab\"\") Baron\"\tCA\tM\tWinter jacket\t1\t2001-07-04\n", + "ben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1\t2001/07/04\n", + "arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table\t1\t2001-07\n", + "Project 
1834697810094 has been successfully deleted\n" + ] + } + ], + "source": [ + "p = cli.create('data/cli/duplicates.csv', limit=5)\n", + "cli.export(p.project_id)\n", + "cli.delete(p.project_id)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### separator and processQuotes\n", + "\n", + "should contain 10 rows and 2 columns (Column 2)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id: 1745680810911\n", + "rows: 10\n", + "email,name,state,gender,purchase,count,date\tColumn 2\n", + "\"danny.baron@example1.com,Danny Baron,CA,M,TV (UTF-8: ๐Ÿ“บ),1,\"\"Wed, 4 Jul 2001\"\t\n", + "melanie.white@example2.edu,Melanie White,NC,F,,1,2001-07-04T12:08:56\t\n", + "danny.baron@example1.com, D.\t\"(\"\"Tab\"\") Baron,CA,M,Winter jacket,1,2001-07-04\"\n", + "ben.tyler@example3.org,Ben Tyler,NV,M,Flashlight,1,2001/07/04\t\n", + "arthur.duff@example4.com,Arthur Duff,OR,M,Dining table,1,2001-07\t\n", + "danny.baron@example1.com,Daniel Baron,,,Bike,1,2001\t\n", + "jean.griffith@example5.org,Jean Griffith,WA,F,Power drill,1,2000\t\n", + "melanie.white@example2.edu,Melanie White,NC,F,'iPad',1,1999\t\n", + "ben.morisson@example6.org,Ben Morisson,FL,M,Amplifier,1,1998\t\n", + "arthur.duff@example4.com,Arthur Duff,OR,M,Night table,1,1997\t\n", + "Project 1745680810911 has been successfully deleted\n" + ] + } + ], + "source": [ + "p = cli.create('data/cli/duplicates.csv', separator=' ', processQuotes=False)\n", + "cli.export(p.project_id)\n", + "cli.delete(p.project_id)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### projectName" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id: 2022088294800\n", + "rows: 10\n", + " id: 2022088294800\n", + " url: 
http://127.0.0.1:3333/project?project=2022088294800\n", + " name: foo\n", + " modified: 2019-08-20T02:12:53Z\n", + " created: 2019-08-20T02:12:53Z\n", + " rowCount: 10\n", + "importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.csv', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'foo', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n", + " column 001: email\n", + " column 002: name\n", + " column 003: state\n", + " column 004: gender\n", + " column 005: purchase\n", + " column 006: count\n", + " column 007: date\n", + "Project 2022088294800 has been successfully deleted\n" + ] + } + ], + "source": [ + "p = cli.create('data/cli/duplicates.csv', projectName='foo')\n", + "cli.info(p.project_id)\n", + "cli.delete(p.project_id)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### projectTags (introduced in OpenRefine 2.8)\n", + "\n", + "check manually at http://127.0.0.1:3333 > Open Project if tags were stored" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id: 2228120867351\n", + "rows: 10\n", + " id: 2228120867351\n", + " url: http://127.0.0.1:3333/project?project=2228120867351\n", + " name: duplicates\n", + " tags: [u'client1', u'beta']\n", + " modified: 2019-08-20T02:12:53Z\n", + " created: 2019-08-20T02:12:53Z\n", + " rowCount: 10\n", + "importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.csv', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'duplicates', u'projectTags': [u'client1', u'beta'], u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n", + " column 
001: email\n", + " column 002: name\n", + " column 003: state\n", + " column 004: gender\n", + " column 005: purchase\n", + " column 006: count\n", + " column 007: date\n" + ] + } + ], + "source": [ + "p = cli.create('data/cli/duplicates.csv', projectTags=['client1', 'beta'])\n", + "cli.info(p.project_id)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Project 2228120867351 has been successfully deleted\n" + ] + } + ], + "source": [ + "cli.delete(p.project_id)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### skipDataLines\n", + "\n", + "should contain 5 rows" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id: 1725478809832\n", + "rows: 5\n", + "email\tname\tstate\tgender\tpurchase\tcount\tdate\n", + "danny.baron@example1.com\tDaniel Baron\t\t\tBike\t1\t2001\n", + "jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1\t2000\n", + "melanie.white@example2.edu\tMelanie White\tNC\tF\t'iPad'\t1\t1999\n", + "ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1\t1998\n", + "arthur.duff@example4.com\tArthur Duff\tOR\tM\tNight table\t1\t1997\n", + "Project 1725478809832 has been successfully deleted\n" + ] + } + ], + "source": [ + "p = cli.create('data/cli/duplicates.csv', skipDataLines=5)\n", + "cli.export(p.project_id)\n", + "cli.delete(p.project_id)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### storeBlankCellsAsNulls\n", + "\n", + "check OpenRefine GUI at url below:\n", + "* All > View > Show/Hide 'null' values in cells\n", + "* row 6 should contain null values in columns state and gender" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": 
"stream", + "text": [ + "id: 2533896794214\n", + "rows: 10\n", + " id: 2533896794214\n", + " url: http://127.0.0.1:3333/project?project=2533896794214\n", + " name: duplicates\n", + " modified: 2019-08-20T02:12:54Z\n", + " created: 2019-08-20T02:12:54Z\n", + " rowCount: 10\n", + "importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.csv', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': True, u'includeFileSources': False}]\n", + " column 001: email\n", + " column 002: name\n", + " column 003: state\n", + " column 004: gender\n", + " column 005: purchase\n", + " column 006: count\n", + " column 007: date\n" + ] + } + ], + "source": [ + "p = cli.create('data/cli/duplicates.csv', guessCellValueTypes=True)\n", + "cli.info(p.project_id)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Project 2533896794214 has been successfully deleted\n" + ] + } + ], + "source": [ + "cli.delete(p.project_id)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## TSV" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### default" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id: 2281824651803\n", + "rows: 10\n", + " id: 2281824651803\n", + " url: http://127.0.0.1:3333/project?project=2281824651803\n", + " name: duplicates\n", + " modified: 2019-08-20T02:12:54Z\n", + " created: 2019-08-20T02:12:54Z\n", + " rowCount: 10\n", + "importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.tsv', u'storeBlankRows': True, u'encoding': u'', u'projectName': 
u'duplicates', u'processQuotes': True, u'limit': -1, u'trimStrings': False, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n", + " column 001: email\n", + " column 002: name\n", + " column 003: state\n", + " column 004: gender\n", + " column 005: purchase\n", + " column 006: count\n", + " column 007: date\n", + "email\tname\tstate\tgender\tpurchase\tcount\tdate\n", + "danny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: ๐Ÿ“บ)\t1\tWed, 4 Jul 2001\n", + "melanie.white@example2.edu\tMelanie White\tNC\tF\t\t1\t2001-07-04T12:08:56\n", + "danny.baron@example1.com\t\"D.\t(\"\"Tab\"\") Baron\"\tCA\tM\tWinter jacket\t1\t2001-07-04\n", + "ben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1\t2001/07/04\n", + "arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table\t1\t2001-07\n", + "danny.baron@example1.com\tDaniel Baron\t\t\tBike\t1\t2001\n", + "jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1\t2000\n", + "melanie.white@example2.edu\tMelanie White\tNC\tF\t'iPad'\t1\t1999\n", + "ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1\t1998\n", + "arthur.duff@example4.com\tArthur Duff\tOR\tM\tNight table\t1\t1997\n", + "Project 2281824651803 has been successfully deleted\n" + ] + } + ], + "source": [ + "p = cli.create('data/cli/duplicates.tsv')\n", + "cli.info(p.project_id)\n", + "cli.export(p.project_id)\n", + "cli.delete(p.project_id)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## JSON" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### default" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id: 2534262116323\n", + "rows: 10\n", + " id: 2534262116323\n", + " url: http://127.0.0.1:3333/project?project=2534262116323\n", + " name: duplicates\n", + " modified: 2019-08-20T02:12:54Z\n", + " created: 
2019-08-20T02:12:54Z\n", + " rowCount: 10\n", + "importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.json', u'storeBlankRows': True, u'encoding': u'', u'recordPath': [u'_', u'_'], u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n", + " column 001: _ - name\n", + " column 002: _ - date\n", + " column 003: _ - email\n", + " column 004: _ - state\n", + " column 005: _ - count\n", + " column 006: _ - gender\n", + " column 007: _ - purchase\n", + "_ - name\t_ - date\t_ - email\t_ - state\t_ - count\t_ - gender\t_ - purchase\n", + "Danny Baron\tWed, 4 Jul 2001\tdanny.baron@example1.com\tCA\t1\tM\tTV (UTF-8: ๐Ÿ“บ)\n", + "Melanie White\t2001-07-04T12:08:56\tmelanie.white@example2.edu\tNC\t1\tF\t\n", + "\" D.\t(\"\"Tab\"\") Baron\"\t2001-07-04\tdanny.baron@example1.com\tCA\t1\tM\tWinter jacket\n", + "Ben Tyler\t2001/07/04\tben.tyler@example3.org\tNV\t1\tM\tFlashlight\n", + "Arthur Duff\t2001-07\tarthur.duff@example4.com\tOR\t1\tM\tDining table\n", + "Daniel Baron\t2001\tdanny.baron@example1.com\t\t1\t\tBike\n", + "Jean Griffith\t2000\tjean.griffith@example5.org\tWA\t1\tF\tPower drill\n", + "Melanie White\t1999\tmelanie.white@example2.edu\tNC\t1\tF\t'iPad'\n", + "Ben Morisson\t1998\tben.morisson@example6.org\tFL\t1\tM\tAmplifier\n", + "Arthur Duff\t1997\tarthur.duff@example4.com\tOR\t1\tM\tNight table\n", + "Project 2534262116323 has been successfully deleted\n" + ] + } + ], + "source": [ + "p = cli.create('data/cli/duplicates.json')\n", + "cli.info(p.project_id)\n", + "cli.export(p.project_id)\n", + "cli.delete(p.project_id)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### trimStrings (broken, does not work in the GUI either)\n", + "\n", + "check row 3 if spaces before `D.` are deleted" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + 
"metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id: 2495073177504\n", + "rows: 10\n", + " id: 2495073177504\n", + " url: http://127.0.0.1:3333/project?project=2495073177504\n", + " name: duplicates\n", + " modified: 2019-08-20T02:12:54Z\n", + " created: 2019-08-20T02:12:54Z\n", + " rowCount: 10\n", + "importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.json', u'storeBlankRows': True, u'encoding': u'', u'recordPath': [u'_', u'_'], u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': True, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n", + " column 001: _ - name\n", + " column 002: _ - date\n", + " column 003: _ - email\n", + " column 004: _ - state\n", + " column 005: _ - count\n", + " column 006: _ - gender\n", + " column 007: _ - purchase\n", + "_ - name\t_ - date\t_ - email\t_ - state\t_ - count\t_ - gender\t_ - purchase\n", + "Danny Baron\tWed, 4 Jul 2001\tdanny.baron@example1.com\tCA\t1\tM\tTV (UTF-8: ๐Ÿ“บ)\n", + "Melanie White\t2001-07-04T12:08:56\tmelanie.white@example2.edu\tNC\t1\tF\t\n", + "\" D.\t(\"\"Tab\"\") Baron\"\t2001-07-04\tdanny.baron@example1.com\tCA\t1\tM\tWinter jacket\n", + "Ben Tyler\t2001/07/04\tben.tyler@example3.org\tNV\t1\tM\tFlashlight\n", + "Arthur Duff\t2001-07\tarthur.duff@example4.com\tOR\t1\tM\tDining table\n", + "Daniel Baron\t2001\tdanny.baron@example1.com\t\t1\t\tBike\n", + "Jean Griffith\t2000\tjean.griffith@example5.org\tWA\t1\tF\tPower drill\n", + "Melanie White\t1999\tmelanie.white@example2.edu\tNC\t1\tF\t'iPad'\n", + "Ben Morisson\t1998\tben.morisson@example6.org\tFL\t1\tM\tAmplifier\n", + "Arthur Duff\t1997\tarthur.duff@example4.com\tOR\t1\tM\tNight table\n", + "Project 2495073177504 has been successfully deleted\n" + ] + } + ], + "source": [ + "p = cli.create('data/cli/duplicates.json', trimStrings=True)\n", + 
"cli.info(p.project_id)\n", + "cli.export(p.project_id)\n", + "cli.delete(p.project_id)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### recordPath" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id: 1671966444040\n", + "rows: 10\n", + " id: 1671966444040\n", + " url: http://127.0.0.1:3333/project?project=1671966444040\n", + " name: duplicates\n", + " modified: 2019-08-20T02:12:54Z\n", + " created: 2019-08-20T02:12:54Z\n", + " rowCount: 10\n", + "importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.json', u'storeBlankRows': True, u'encoding': u'', u'recordPath': [u'_', u'_', u'purchase'], u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n", + " column 001: purchase\n", + "purchase\n", + "TV (UTF-8: ๐Ÿ“บ)\n", + "\n", + "Winter jacket\n", + "Flashlight\n", + "Dining table\n", + "Bike\n", + "Power drill\n", + "'iPad'\n", + "Amplifier\n", + "Night table\n", + "Project 1671966444040 has been successfully deleted\n" + ] + } + ], + "source": [ + "p = cli.create('data/cli/duplicates.json', recordPath=['_', '_', 'purchase'])\n", + "cli.info(p.project_id)\n", + "cli.export(p.project_id)\n", + "cli.delete(p.project_id)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### storeEmptyStrings\n", + "\n", + "default: True; set to False for null values\n", + "\n", + "check OpenRefine GUI at url below:\n", + "* All > View > Show/Hide 'null' values in cells\n", + "* row 6 should contain null values in columns state and gender" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ 
+ "id: 2078676878032\n", + "rows: 10\n", + " id: 2078676878032\n", + " url: http://127.0.0.1:3333/project?project=2078676878032\n", + " name: duplicates\n", + " modified: 2019-08-20T02:12:54Z\n", + " created: 2019-08-20T02:12:54Z\n", + " rowCount: 10\n", + "importOptionMetadata: [{u'storeEmptyStrings': False, u'fileSource': u'data/cli/duplicates.json', u'storeBlankRows': True, u'encoding': u'', u'recordPath': [u'_', u'_'], u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n", + " column 001: _ - name\n", + " column 002: _ - date\n", + " column 003: _ - email\n", + " column 004: _ - count\n", + " column 005: _ - purchase\n", + " column 006: _ - state\n", + " column 007: _ - gender\n" + ] + } + ], + "source": [ + "p = cli.create('data/cli/duplicates.json', storeEmptyStrings=False)\n", + "cli.info(p.project_id)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Project 2078676878032 has been successfully deleted\n" + ] + } + ], + "source": [ + "cli.delete(p.project_id)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## XML" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### default" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id: 2264312539076\n", + "rows: 80\n", + " id: 2264312539076\n", + " url: http://127.0.0.1:3333/project?project=2264312539076\n", + " name: duplicates\n", + " modified: 2019-08-20T02:12:54Z\n", + " created: 2019-08-20T02:12:54Z\n", + " rowCount: 80\n", + "importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.xml', u'storeBlankRows': True, 
u'encoding': u'', u'recordPath': [u'root'], u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n", + " column 001: root\n", + " column 002: root - record\n", + " column 003: root - record - name\n", + " column 004: root - record - date\n", + " column 005: root - record - email\n", + " column 006: root - record - count\n", + " column 007: root - record - purchase\n", + " column 008: root - record - state\n", + " column 009: root - record - gender\n", + "root\troot - record\troot - record - name\troot - record - date\troot - record - email\troot - record - count\troot - record - purchase\troot - record - state\troot - record - gender\n", + "\"\n", + " \"\t\"\n", + " \"\tDanny Baron\tWed, 4 Jul 2001\tdanny.baron@example1.com\t1\tTV (UTF-8: ๐Ÿ“บ)\tCA\tM\n", + "\"\n", + " \"\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\"\n", + " \"\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\"\n", + " \"\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\"\n", + " \"\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\"\n", + " \"\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\"\n", + " \"\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\"\n", + " \"\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\"\n", + " \"\t\"\n", + " \"\tMelanie White\t2001-07-04T12:08:56\tmelanie.white@example2.edu\t1\t\tNC\tF\n", + "\"\n", + " \"\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\"\n", + "\"\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\" D.\t(\"\"Tab\"\") Baron\"\t2001-07-04\tdanny.baron@example1.com\t1\tWinter jacket\tCA\tM\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " 
\"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\tBen Tyler\t2001/07/04\tben.tyler@example3.org\t1\tFlashlight\tNV\tM\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\tArthur Duff\t2001-07\tarthur.duff@example4.com\t1\tDining table\tOR\tM\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\tDaniel Baron\t2001\tdanny.baron@example1.com\t1\tBike\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\tJean Griffith\t2000\tjean.griffith@example5.org\t1\tPower drill\tWA\tF\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\tMelanie White\t1999\tmelanie.white@example2.edu\t1\t'iPad'\tNC\tF\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\tBen Morisson\t1998\tben.morisson@example6.org\t1\tAmplifier\tFL\tM\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " 
\"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\tArthur Duff\t1997\tarthur.duff@example4.com\t1\tNight table\tOR\tM\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "Project 2264312539076 has been successfully deleted\n" + ] + } + ], + "source": [ + "p = cli.create('data/cli/duplicates.xml')\n", + "cli.info(p.project_id)\n", + "cli.export(p.project_id)\n", + "cli.delete(p.project_id)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### trimStrings (broken, does not work in the GUI either)\n", + "\n", + "check if spaces before `D.` are deleted" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id: 1917953863988\n", + "rows: 80\n", + " id: 1917953863988\n", + " url: http://127.0.0.1:3333/project?project=1917953863988\n", + " name: duplicates\n", + " modified: 2019-08-20T02:12:54Z\n", + " created: 2019-08-20T02:12:54Z\n", + " rowCount: 80\n", + "importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.xml', u'storeBlankRows': True, u'encoding': u'', u'recordPath': [u'root'], u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': True, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n", + " column 001: root\n", + " column 002: root - record\n", + " column 003: root - record - name\n", + " column 004: root - record - date\n", + " column 005: root - record - email\n", + " column 006: root - record - count\n", + " column 007: root - record 
- purchase\n", + " column 008: root - record - state\n", + " column 009: root - record - gender\n", + "root\troot - record\troot - record - name\troot - record - date\troot - record - email\troot - record - count\troot - record - purchase\troot - record - state\troot - record - gender\n", + "\"\n", + " \"\t\"\n", + " \"\tDanny Baron\tWed, 4 Jul 2001\tdanny.baron@example1.com\t1\tTV (UTF-8: ๐Ÿ“บ)\tCA\tM\n", + "\"\n", + " \"\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\"\n", + " \"\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\"\n", + " \"\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\"\n", + " \"\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\"\n", + " \"\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\"\n", + " \"\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\"\n", + " \"\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\"\n", + " \"\t\"\n", + " \"\tMelanie White\t2001-07-04T12:08:56\tmelanie.white@example2.edu\t1\t\tNC\tF\n", + "\"\n", + " \"\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\"\n", + "\"\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\" D.\t(\"\"Tab\"\") Baron\"\t2001-07-04\tdanny.baron@example1.com\t1\tWinter jacket\tCA\tM\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\tBen Tyler\t2001/07/04\tben.tyler@example3.org\t1\tFlashlight\tNV\tM\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\tArthur Duff\t2001-07\tarthur.duff@example4.com\t1\tDining table\tOR\tM\n", + "\t\"\n", + " 
\"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\tDaniel Baron\t2001\tdanny.baron@example1.com\t1\tBike\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\tJean Griffith\t2000\tjean.griffith@example5.org\t1\tPower drill\tWA\tF\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\tMelanie White\t1999\tmelanie.white@example2.edu\t1\t'iPad'\tNC\tF\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\tBen Morisson\t1998\tben.morisson@example6.org\t1\tAmplifier\tFL\tM\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\tArthur Duff\t1997\tarthur.duff@example4.com\t1\tNight table\tOR\tM\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "\t\"\n", + " \"\t\t\t\t\t\t\t\n", + "Project 
1917953863988 has been successfully deleted\n" + ] + } + ], + "source": [ + "p = cli.create('data/cli/duplicates.xml', trimStrings=True)\n", + "cli.info(p.project_id)\n", + "cli.export(p.project_id)\n", + "cli.delete(p.project_id)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### recordPath" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id: 2293178566671\n", + "rows: 10\n", + " id: 2293178566671\n", + " url: http://127.0.0.1:3333/project?project=2293178566671\n", + " name: duplicates\n", + " modified: 2019-08-20T02:12:54Z\n", + " created: 2019-08-20T02:12:54Z\n", + " rowCount: 10\n", + "importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.xml', u'storeBlankRows': True, u'encoding': u'', u'recordPath': [u'root', u'record', u'purchase'], u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n", + " column 001: purchase\n", + "purchase\n", + "TV (UTF-8: ๐Ÿ“บ)\n", + "\n", + "Winter jacket\n", + "Flashlight\n", + "Dining table\n", + "Bike\n", + "Power drill\n", + "'iPad'\n", + "Amplifier\n", + "Night table\n", + "Project 2293178566671 has been successfully deleted\n" + ] + } + ], + "source": [ + "p = cli.create('data/cli/duplicates.xml', recordPath=['root', 'record', 'purchase'])\n", + "cli.info(p.project_id)\n", + "cli.export(p.project_id)\n", + "cli.delete(p.project_id)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### storeEmptyStrings\n", + "\n", + "default: True; set to False for null values\n", + "\n", + "check OpenRefine GUI at url below:\n", + "* All > View > Show/Hide 'null' values in cells\n", + "* row 6 should contain null values in columns state and gender" + ] + }, + { + 
"cell_type": "code", + "execution_count": 27, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id: 2438123269695\n", + "rows: 10\n", + " id: 2438123269695\n", + " url: http://127.0.0.1:3333/project?project=2438123269695\n", + " name: duplicates\n", + " modified: 2019-08-20T02:12:54Z\n", + " created: 2019-08-20T02:12:54Z\n", + " rowCount: 10\n", + "importOptionMetadata: [{u'storeEmptyStrings': False, u'fileSource': u'data/cli/duplicates.csv', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n", + " column 001: email\n", + " column 002: name\n", + " column 003: state\n", + " column 004: gender\n", + " column 005: purchase\n", + " column 006: count\n", + " column 007: date\n" + ] + } + ], + "source": [ + "p = cli.create('data/cli/duplicates.csv', storeEmptyStrings=False)\n", + "cli.info(p.project_id)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Project 2438123269695 has been successfully deleted\n" + ] + } + ], + "source": [ + "cli.delete(p.project_id)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## TXT" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### default (line-based)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id: 1913292396645\n", + "rows: 11\n", + " id: 1913292396645\n", + " url: http://127.0.0.1:3333/project?project=1913292396645\n", + " name: duplicates\n", + " modified: 2019-08-20T02:12:55Z\n", + " created: 2019-08-20T02:12:54Z\n", + " rowCount: 11\n", + "importOptionMetadata: 
[{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.txt', u'storeBlankRows': True, u'encoding': u'', u'ignoreLines': -1, u'projectName': u'duplicates', u'processQuotes': True, u'skipDataLines': -1, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False, u'headerLines': 0}]\n", + " column 001: Column 1\n", + "Column 1\n", + "email name state gender purchase count date \n", + "danny.baron@example1.com Danny Baron CA M TV (UTF-8: ๐Ÿ“บ) 1 Wed, 4 Jul 2001 \n", + "melanie.white@example2.edu Melanie White NC F 1 2001-07-04T12:08:5\n", + "\"danny.baron@example1.com D.\t(\"\"Tab\"\") Baron CA M Winter jacket 1 2001-07-04 \"\n", + "ben.tyler@example3.org Ben Tyler NV M Flashlight 1 2001/07/04 \n", + "arthur.duff@example4.com Arthur Duff OR M Dining table 1 2001-07 \n", + "danny.baron@example1.com Daniel Baron Bike 1 2001 \n", + "jean.griffith@example5.org Jean Griffith WA F Power drill 1 2000 \n", + "melanie.white@example2.edu Melanie White NC F 'iPad' 1 1999 \n", + "ben.morisson@example6.org Ben Morisson FL M Amplifier 1 1998 \n", + "arthur.duff@example4.com Arthur Duff OR M Night table 1 1997 \n", + "Project 1913292396645 has been successfully deleted\n" + ] + } + ], + "source": [ + "p = cli.create('data/cli/duplicates.txt')\n", + "cli.info(p.project_id)\n", + "cli.export(p.project_id)\n", + "cli.delete(p.project_id)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### linesPerRow\n", + "\n", + "should return 6 rows in 2 columns" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id: 1958513543951\n", + "rows: 6\n", + " id: 1958513543951\n", + " url: http://127.0.0.1:3333/project?project=1958513543951\n", + " name: duplicates\n", + " modified: 2019-08-20T02:12:55Z\n", + " created: 2019-08-20T02:12:55Z\n", + " rowCount: 6\n", + 
"importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.txt', u'storeBlankRows': True, u'encoding': u'', u'ignoreLines': -1, u'projectName': u'duplicates', u'processQuotes': True, u'limit': -1, u'skipDataLines': -1, u'separator': u',', u'trimStrings': False, u'linesPerRow': 2, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False, u'headerLines': 0}]\n", + " column 001: Column 1\n", + " column 002: Column 2\n", + "Column 1\tColumn 2\n", + "email name state gender purchase count date \tdanny.baron@example1.com Danny Baron CA M TV (UTF-8: ๐Ÿ“บ) 1 Wed, 4 Jul 2001 \n", + "melanie.white@example2.edu Melanie White NC F 1 2001-07-04T12:08:5\t\"danny.baron@example1.com D.\t(\"\"Tab\"\") Baron CA M Winter jacket 1 2001-07-04 \"\n", + "ben.tyler@example3.org Ben Tyler NV M Flashlight 1 2001/07/04 \tarthur.duff@example4.com Arthur Duff OR M Dining table 1 2001-07 \n", + "danny.baron@example1.com Daniel Baron Bike 1 2001 \tjean.griffith@example5.org Jean Griffith WA F Power drill 1 2000 \n", + "melanie.white@example2.edu Melanie White NC F 'iPad' 1 1999 \tben.morisson@example6.org Ben Morisson FL M Amplifier 1 1998 \n", + "arthur.duff@example4.com Arthur Duff OR M Night table 1 1997 \t\n", + "Project 1958513543951 has been successfully deleted\n" + ] + } + ], + "source": [ + "p = cli.create('data/cli/duplicates.txt', linesPerRow=2)\n", + "cli.info(p.project_id)\n", + "cli.export(p.project_id)\n", + "cli.delete(p.project_id)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### fixed-width: columnWidths and headerLines" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id: 1703842312470\n", + "rows: 10\n", + " id: 1703842312470\n", + " url: http://127.0.0.1:3333/project?project=1703842312470\n", + " name: duplicates\n", + " modified: 2019-08-20T02:12:55Z\n", + " created: 
2019-08-20T02:12:55Z\n", + " rowCount: 10\n", + "importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.txt', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'duplicates', u'processQuotes': True, u'limit': -1, u'separator': u',', u'trimStrings': False, u'columnWidths': [27, 21, 6, 7, 15, 6, 1000], u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False, u'headerLines': 1}]\n", + " column 001: email\n", + " column 002: name\n", + " column 003: state\n", + " column 004: gender\n", + " column 005: purchase\n", + " column 006: count\n", + " column 007: date\n", + "email\tname\tstate\tgender\tpurchase\tcount\tdate\n", + "danny.baron@example1.com \tDanny Baron \tCA \tM \tTV (UTF-8: ๐Ÿ“บ) \t1 \tWed, 4 Jul 2001 \n", + "melanie.white@example2.edu \tMelanie White \tNC \tF \t \t1 \t2001-07-04T12:08:5\n", + "danny.baron@example1.com \t\" D.\t(\"\"Tab\"\") Baron \"\tCA \tM \tWinter jacket \t1 \t2001-07-04 \n", + "ben.tyler@example3.org \tBen Tyler \tNV \tM \tFlashlight \t1 \t2001/07/04 \n", + "arthur.duff@example4.com \tArthur Duff \tOR \tM \tDining table \t1 \t2001-07 \n", + "danny.baron@example1.com \tDaniel Baron \t \t \tBike \t1 \t2001 \n", + "jean.griffith@example5.org \tJean Griffith \tWA \tF \tPower drill \t1 \t2000 \n", + "melanie.white@example2.edu \tMelanie White \tNC \tF \t'iPad' \t1 \t1999 \n", + "ben.morisson@example6.org \tBen Morisson \tFL \tM \tAmplifier \t1 \t1998 \n", + "arthur.duff@example4.com \tArthur Duff \tOR \tM \tNight table \t1 \t1997 \n", + "Project 1703842312470 has been successfully deleted\n" + ] + } + ], + "source": [ + "p = cli.create('data/cli/duplicates.txt', columnWidths=[27, 21, 6, 7, 15, 6, 1000], headerLines=1)\n", + "cli.info(p.project_id)\n", + "cli.export(p.project_id)\n", + "cli.delete(p.project_id)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ZIP" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + 
"### default\n", + "\n", + "should contain 16 rows" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id: 2381217278039\n", + "rows: 16\n", + " id: 2381217278039\n", + " url: http://127.0.0.1:3333/project?project=2381217278039\n", + " name: duplicates\n", + " modified: 2019-08-20T02:12:55Z\n", + " created: 2019-08-20T02:12:55Z\n", + " rowCount: 16\n", + "importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'duplicates2.csv', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}, {u'storeEmptyStrings': True, u'fileSource': u'duplicates2.csv', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n", + " column 001: email\n", + " column 002: name\n", + " column 003: state\n", + " column 004: gender\n", + " column 005: purchase\n", + " column 006: count\n", + " column 007: date\n", + "email\tname\tstate\tgender\tpurchase\tcount\tdate\n", + "danny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: ๐Ÿ“บ)\t1\tWed, 4 Jul 2001\n", + "melanie.white@example2.edu\tMelanie White\tNC\tF\t\t1\t2001-07-04T12:08:56\n", + "danny.baron@example1.com\t\" D.\t(\"\"Tab\"\") Baron\"\tCA\tM\tWinter jacket\t1\t2001-07-04\n", + "ben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1\t2001/07/04\n", + "arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table\t1\t2001-07\n", + "danny.baron@example1.com\tDaniel Baron\t\t\tBike\t1\t2001\n", + "jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1\t2000\n", + "melanie.white@example2.edu\tMelanie 
White\tNC\tF\t'iPad'\t1\t1999\n", + "ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1\t1998\n", + "arthur.duff@example4.com\tArthur Duff\tOR\tM\tNight table\t1\t1997\n", + "danny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: ๐Ÿ“บ), Winter jacket, bike\t3\tWed, 4 Jul 2001, 2001-07-04, 2001\n", + "melanie.white@example2.edu\tMelanie White\tNC\tF\t, 'iPad'\t2\t2001-07-04T12:08:56, 1999\n", + "ben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1\t2001/07/04\n", + "arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table, Night table\t2\t2001-07, 1997\n", + "jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1\t2000\n", + "ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1\t1998\n", + "Project 2381217278039 has been successfully deleted\n" + ] + } + ], + "source": [ + "p = cli.create('data/cli/duplicates.zip')\n", + "cli.info(p.project_id)\n", + "cli.export(p.project_id)\n", + "cli.delete(p.project_id)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### includeFileSources\n", + "\n", + "should contain column File" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id: 2314884555837\n", + "rows: 16\n", + " id: 2314884555837\n", + " url: http://127.0.0.1:3333/project?project=2314884555837\n", + " name: duplicates\n", + " modified: 2019-08-20T02:12:55Z\n", + " created: 2019-08-20T02:12:55Z\n", + " rowCount: 16\n", + "importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'duplicates2.csv', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': True}, {u'storeEmptyStrings': True, u'fileSource': u'duplicates2.csv', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'duplicates', 
u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': True}]\n", + " column 001: File\n", + " column 002: email\n", + " column 003: name\n", + " column 004: state\n", + " column 005: gender\n", + " column 006: purchase\n", + " column 007: count\n", + " column 008: date\n", + "File\temail\tname\tstate\tgender\tpurchase\tcount\tdate\n", + "duplicates.csv\tdanny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: ๐Ÿ“บ)\t1\tWed, 4 Jul 2001\n", + "duplicates.csv\tmelanie.white@example2.edu\tMelanie White\tNC\tF\t\t1\t2001-07-04T12:08:56\n", + "duplicates.csv\tdanny.baron@example1.com\t\" D.\t(\"\"Tab\"\") Baron\"\tCA\tM\tWinter jacket\t1\t2001-07-04\n", + "duplicates.csv\tben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1\t2001/07/04\n", + "duplicates.csv\tarthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table\t1\t2001-07\n", + "duplicates.csv\tdanny.baron@example1.com\tDaniel Baron\t\t\tBike\t1\t2001\n", + "duplicates.csv\tjean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1\t2000\n", + "duplicates.csv\tmelanie.white@example2.edu\tMelanie White\tNC\tF\t'iPad'\t1\t1999\n", + "duplicates.csv\tben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1\t1998\n", + "duplicates.csv\tarthur.duff@example4.com\tArthur Duff\tOR\tM\tNight table\t1\t1997\n", + "duplicates2.csv\tdanny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: ๐Ÿ“บ), Winter jacket, bike\t3\tWed, 4 Jul 2001, 2001-07-04, 2001\n", + "duplicates2.csv\tmelanie.white@example2.edu\tMelanie White\tNC\tF\t, 'iPad'\t2\t2001-07-04T12:08:56, 1999\n", + "duplicates2.csv\tben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1\t2001/07/04\n", + "duplicates2.csv\tarthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table, Night table\t2\t2001-07, 1997\n", + "duplicates2.csv\tjean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1\t2000\n", + 
"duplicates2.csv\tben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1\t1998\n", + "Project 2314884555837 has been successfully deleted\n" + ] + } + ], + "source": [ + "p = cli.create('data/cli/duplicates.zip', includeFileSources=True)\n", + "cli.info(p.project_id)\n", + "cli.export(p.project_id)\n", + "cli.delete(p.project_id)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ODS (broken in OpenRefine >=2.8)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### default\n", + "\n", + "many blank columns and rows in OpenRefine <=2.7 (also with manual import via GUI)" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id: 1620818141127\n", + "rows: 11\n", + " id: 1620818141127\n", + " url: http://127.0.0.1:3333/project?project=1620818141127\n", + " name: duplicates\n", + " modified: 2019-08-20T02:13:41Z\n", + " created: 2019-08-20T02:13:41Z\n", + " column 001: email\n", + " column 002: name\n", + " column 003: state\n", + " column 004: gender\n", + " column 005: purchase\n", + " column 006: count\n", + " column 007: date\n", + " column 008: Column\n", + " column 009: Column 9\n", + " column 010: Column 10\n", + " column 011: Column 11\n", + " column 012: Column 12\n", + " column 013: Column 13\n", + " column 014: Column 14\n", + " column 015: Column 15\n", + " column 016: Column 16\n", + " column 017: Column 17\n", + " column 018: Column 18\n", + " column 019: Column 19\n", + "email\tname\tstate\tgender\tpurchase\tcount\tdate\tColumn\tColumn 9\tColumn 10\tColumn 11\tColumn 12\tColumn 13\tColumn 14\tColumn 15\tColumn 16\tColumn 17\tColumn 18\tColumn 19\n", + "danny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: ๐Ÿ“บ)\t1.0\tWed, 4 Jul 2001\t\t\t\t\t\t\t\t\t\t\t\t\n", + "melanie.white@example2.edu\tMelanie White\tNC\tF\t\t1.0\t2001-07-04T12:08:56\t\t\t\t\t\t\t\t\t\t\t\t\n", + 
"danny.baron@example1.com\t\" D.(\"\"Tab\"\") Baron\"\tCA\tM\tWinter jacket\t1.0\t2001-07-04\t\t\t\t\t\t\t\t\t\t\t\t\n", + "ben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1.0\t2001/07/04\t\t\t\t\t\t\t\t\t\t\t\t\n", + "arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table\t1.0\t2001-07\t\t\t\t\t\t\t\t\t\t\t\t\n", + "danny.baron@example1.com\tDaniel Baron\t\t\tBike\t1.0\t2001.0\t\t\t\t\t\t\t\t\t\t\t\t\n", + "jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1.0\t2000.0\t\t\t\t\t\t\t\t\t\t\t\t\n", + "melanie.white@example2.edu\tMelanie White\tNC\tF\t'iPad'\t1.0\t1999.0\t\t\t\t\t\t\t\t\t\t\t\t\n", + "ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1.0\t1998.0\t\t\t\t\t\t\t\t\t\t\t\t\n", + "arthur.duff@example4.com\tArthur Duff\tOR\tM\tNight table\t1.0\t1997.0\t\t\t\t\t\t\t\t\t\t\t\t\n", + "Project 1620818141127 has been successfully deleted\n" + ] + } + ], + "source": [ + "p = cli.create('data/cli/duplicates.ods')\n", + "cli.info(p.project_id)\n", + "cli.export(p.project_id)\n", + "cli.delete(p.project_id)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### sheets" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "first sheet from file with 2 sheets" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id: 1985853059017\n", + "rows: 11\n", + " id: 1985853059017\n", + " url: http://127.0.0.1:3333/project?project=1985853059017\n", + " name: duplicates2\n", + " modified: 2019-08-20T02:13:47Z\n", + " created: 2019-08-20T02:13:47Z\n", + " column 001: email\n", + " column 002: name\n", + " column 003: state\n", + " column 004: gender\n", + " column 005: purchase\n", + " column 006: count\n", + " column 007: date\n", + " column 008: Column\n", + " column 009: Column 9\n", + " column 010: Column 10\n", + " column 011: Column 11\n", + " column 012: Column 12\n", + " column 013: Column 
13\n", + " column 014: Column 14\n", + " column 015: Column 15\n", + " column 016: Column 16\n", + " column 017: Column 17\n", + " column 018: Column 18\n", + " column 019: Column 19\n", + "email\tname\tstate\tgender\tpurchase\tcount\tdate\tColumn\tColumn 9\tColumn 10\tColumn 11\tColumn 12\tColumn 13\tColumn 14\tColumn 15\tColumn 16\tColumn 17\tColumn 18\tColumn 19\n", + "danny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: ๐Ÿ“บ)\t1.0\tWed, 4 Jul 2001\t\t\t\t\t\t\t\t\t\t\t\t\n", + "melanie.white@example2.edu\tMelanie White\tNC\tF\t\t1.0\t2001-07-04T12:08:56\t\t\t\t\t\t\t\t\t\t\t\t\n", + "danny.baron@example1.com\t\" D.(\"\"Tab\"\") Baron\"\tCA\tM\tWinter jacket\t1.0\t2001-07-04\t\t\t\t\t\t\t\t\t\t\t\t\n", + "ben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1.0\t2001/07/04\t\t\t\t\t\t\t\t\t\t\t\t\n", + "arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table\t1.0\t2001-07\t\t\t\t\t\t\t\t\t\t\t\t\n", + "danny.baron@example1.com\tDaniel Baron\t\t\tBike\t1.0\t2001.0\t\t\t\t\t\t\t\t\t\t\t\t\n", + "jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1.0\t2000.0\t\t\t\t\t\t\t\t\t\t\t\t\n", + "melanie.white@example2.edu\tMelanie White\tNC\tF\t'iPad'\t1.0\t1999.0\t\t\t\t\t\t\t\t\t\t\t\t\n", + "ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1.0\t1998.0\t\t\t\t\t\t\t\t\t\t\t\t\n", + "arthur.duff@example4.com\tArthur Duff\tOR\tM\tNight table\t1.0\t1997.0\t\t\t\t\t\t\t\t\t\t\t\t\n", + "Project 1985853059017 has been successfully deleted\n" + ] + } + ], + "source": [ + "p = cli.create('data/cli/duplicates2.ods', sheets=[0])\n", + "cli.info(p.project_id)\n", + "cli.export(p.project_id)\n", + "cli.delete(p.project_id)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "both sheets from file with 2 sheets: should contain 16 rows" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id: 
2325827930833\n", + "rows: 18\n", + " id: 2325827930833\n", + " url: http://127.0.0.1:3333/project?project=2325827930833\n", + " name: duplicates2\n", + " modified: 2019-08-20T02:13:49Z\n", + " created: 2019-08-20T02:13:49Z\n", + " column 001: email\n", + " column 002: name\n", + " column 003: state\n", + " column 004: gender\n", + " column 005: purchase\n", + " column 006: count\n", + " column 007: date\n", + " column 008: Column\n", + " column 009: Column 9\n", + " column 010: Column 10\n", + " column 011: Column 11\n", + " column 012: Column 12\n", + " column 013: Column 13\n", + " column 014: Column 14\n", + " column 015: Column 15\n", + " column 016: Column 16\n", + " column 017: Column 17\n", + " column 018: Column 18\n", + " column 019: Column 19\n", + " column 020: Column 20\n", + " column 021: Column 21\n", + " column 022: Column 22\n", + " column 023: Column 23\n", + " column 024: Column 24\n", + " column 025: Column 25\n", + " column 026: Column 26\n", + "email\tname\tstate\tgender\tpurchase\tcount\tdate\tColumn\tColumn 9\tColumn 10\tColumn 11\tColumn 12\tColumn 13\tColumn 14\tColumn 15\tColumn 16\tColumn 17\tColumn 18\tColumn 19\tColumn 20\tColumn 21\tColumn 22\tColumn 23\tColumn 24\tColumn 25\tColumn 26\n", + "danny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: ๐Ÿ“บ)\t1.0\tWed, 4 Jul 2001\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n", + "melanie.white@example2.edu\tMelanie White\tNC\tF\t\t1.0\t2001-07-04T12:08:56\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n", + "danny.baron@example1.com\t\" D.(\"\"Tab\"\") Baron\"\tCA\tM\tWinter jacket\t1.0\t2001-07-04\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n", + "ben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1.0\t2001/07/04\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n", + "arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table\t1.0\t2001-07\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n", + "danny.baron@example1.com\tDaniel Baron\t\t\tBike\t1.0\t2001.0\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n", + 
"jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1.0\t2000.0\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n", + "melanie.white@example2.edu\tMelanie White\tNC\tF\t'iPad'\t1.0\t1999.0\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n", + "ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1.0\t1998.0\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n", + "arthur.duff@example4.com\tArthur Duff\tOR\tM\tNight table\t1.0\t1997.0\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n", + "danny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: ๐Ÿ“บ), Winter jacket, bike\t3.0\tWed, 4 Jul 2001, 2001-07-04, 2001\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n", + "melanie.white@example2.edu\tMelanie White\tNC\tF\t, 'iPad'\t2.0\t2001-07-04T12:08:56, 1999\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n", + "ben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1.0\t2001/07/04\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n", + "arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table, Night table\t2.0\t2001-07, 1997\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n", + "jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1.0\t2000.0\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n", + "ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1.0\t1998.0\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n", + "Project 2325827930833 has been successfully deleted\n" + ] + } + ], + "source": [ + "p = cli.create('data/cli/duplicates2.ods', sheets=[0, 1])\n", + "cli.info(p.project_id)\n", + "cli.export(p.project_id)\n", + "cli.delete(p.project_id)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## XLS (broken in OpenRefine >=2.8)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### default" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id: 1607123650693\n", + "rows: 10\n", + " id: 1607123650693\n", + " url: http://127.0.0.1:3333/project?project=1607123650693\n", + " name: 
duplicates\n", + " modified: 2019-08-20T02:13:52Z\n", + " created: 2019-08-20T02:13:52Z\n", + " column 001: email\n", + " column 002: name\n", + " column 003: state\n", + " column 004: gender\n", + " column 005: purchase\n", + " column 006: count\n", + " column 007: date\n", + "email\tname\tstate\tgender\tpurchase\tcount\tdate\n", + "danny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: ๐Ÿ“บ)\t1.0\tWed, 4 Jul 2001\n", + "melanie.white@example2.edu\tMelanie White\tNC\tF\t\t1.0\t2001-07-04T12:08:56\n", + "danny.baron@example1.com\t\" D. (\"\"Tab\"\") Baron\"\tCA\tM\tWinter jacket\t1.0\t2001-07-04\n", + "ben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1.0\t2001/07/04\n", + "arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table\t1.0\t2001-07\n", + "danny.baron@example1.com\tDaniel Baron\t\t\tBike\t1.0\t2001.0\n", + "jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1.0\t2000.0\n", + "melanie.white@example2.edu\tMelanie White\tNC\tF\t'iPad'\t1.0\t1999.0\n", + "ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1.0\t1998.0\n", + "arthur.duff@example4.com\tArthur Duff\tOR\tM\tNight table\t1.0\t1997.0\n", + "Project 1607123650693 has been successfully deleted\n" + ] + } + ], + "source": [ + "p = cli.create('data/cli/duplicates.xls')\n", + "cli.info(p.project_id)\n", + "cli.export(p.project_id)\n", + "cli.delete(p.project_id)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### sheets" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "first sheet from file with 2 sheets" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id: 2439816728218\n", + "rows: 10\n", + " id: 2439816728218\n", + " url: http://127.0.0.1:3333/project?project=2439816728218\n", + " name: duplicates2\n", + " modified: 2019-08-20T02:13:58Z\n", + " created: 2019-08-20T02:13:58Z\n", + " column 001: email\n", + " 
column 002: name\n", + " column 003: state\n", + " column 004: gender\n", + " column 005: purchase\n", + " column 006: count\n", + " column 007: date\n", + "email\tname\tstate\tgender\tpurchase\tcount\tdate\n", + "danny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: ๐Ÿ“บ)\t1.0\tWed, 4 Jul 2001\n", + "melanie.white@example2.edu\tMelanie White\tNC\tF\t\t1.0\t2001-07-04T12:08:56\n", + "danny.baron@example1.com\t\" D.(\"\"Tab\"\") Baron\"\tCA\tM\tWinter jacket\t1.0\t2001-07-04\n", + "ben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1.0\t2001/07/04\n", + "arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table\t1.0\t2001-07\n", + "danny.baron@example1.com\tDaniel Baron\t\t\tBike\t1.0\t2001.0\n", + "jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1.0\t2000.0\n", + "melanie.white@example2.edu\tMelanie White\tNC\tF\t'iPad'\t1.0\t1999.0\n", + "ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1.0\t1998.0\n", + "arthur.duff@example4.com\tArthur Duff\tOR\tM\tNight table\t1.0\t1997.0\n", + "Project 2439816728218 has been successfully deleted\n" + ] + } + ], + "source": [ + "p = cli.create('data/cli/duplicates2.xls', sheets=[0])\n", + "cli.info(p.project_id)\n", + "cli.export(p.project_id)\n", + "cli.delete(p.project_id)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "both sheets from file with 2 sheets: should contain 16 rows" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id: 1954256360738\n", + "rows: 16\n", + " id: 1954256360738\n", + " url: http://127.0.0.1:3333/project?project=1954256360738\n", + " name: duplicates2\n", + " modified: 2019-08-20T02:13:59Z\n", + " created: 2019-08-20T02:13:59Z\n", + " column 001: email\n", + " column 002: name\n", + " column 003: state\n", + " column 004: gender\n", + " column 005: purchase\n", + " column 006: count\n", + " column 007: date\n", 
+ "email\tname\tstate\tgender\tpurchase\tcount\tdate\n", + "danny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: ๐Ÿ“บ)\t1.0\tWed, 4 Jul 2001\n", + "melanie.white@example2.edu\tMelanie White\tNC\tF\t\t1.0\t2001-07-04T12:08:56\n", + "danny.baron@example1.com\t\" D.(\"\"Tab\"\") Baron\"\tCA\tM\tWinter jacket\t1.0\t2001-07-04\n", + "ben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1.0\t2001/07/04\n", + "arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table\t1.0\t2001-07\n", + "danny.baron@example1.com\tDaniel Baron\t\t\tBike\t1.0\t2001.0\n", + "jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1.0\t2000.0\n", + "melanie.white@example2.edu\tMelanie White\tNC\tF\t'iPad'\t1.0\t1999.0\n", + "ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1.0\t1998.0\n", + "arthur.duff@example4.com\tArthur Duff\tOR\tM\tNight table\t1.0\t1997.0\n", + "danny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: ๐Ÿ“บ), Winter jacket, bike\t3.0\tWed, 4 Jul 2001, 2001-07-04, 2001\n", + "melanie.white@example2.edu\tMelanie White\tNC\tF\t, 'iPad'\t2.0\t2001-07-04T12:08:56, 1999\n", + "ben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1.0\t2001/07/04\n", + "arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table, Night table\t2.0\t2001-07, 1997\n", + "jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1.0\t2000.0\n", + "ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1.0\t1998.0\n", + "Project 1954256360738 has been successfully deleted\n" + ] + } + ], + "source": [ + "p = cli.create('data/cli/duplicates2.xls', sheets=[0, 1])\n", + "cli.info(p.project_id)\n", + "cli.export(p.project_id)\n", + "cli.delete(p.project_id)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## XLSX (broken in OpenRefine >=2.8)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### default" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + "id: 2423289296267\n", + "rows: 10\n", + " id: 2423289296267\n", + " url: http://127.0.0.1:3333/project?project=2423289296267\n", + " name: duplicates\n", + " modified: 2019-08-20T02:14:01Z\n", + " created: 2019-08-20T02:14:01Z\n", + " column 001: email\n", + " column 002: name\n", + " column 003: state\n", + " column 004: gender\n", + " column 005: purchase\n", + " column 006: count\n", + " column 007: date\n", + "email\tname\tstate\tgender\tpurchase\tcount\tdate\n", + "danny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: ๐Ÿ“บ)\t1.0\tWed, 4 Jul 2001\n", + "melanie.white@example2.edu\tMelanie White\tNC\tF\t\t1.0\t2001-07-04T12:08:56\n", + "danny.baron@example1.com\t\" D. (\"\"Tab\"\") Baron\"\tCA\tM\tWinter jacket\t1.0\t2001-07-04\n", + "ben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1.0\t2001/07/04\n", + "arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table\t1.0\t2001-07\n", + "danny.baron@example1.com\tDaniel Baron\t\t\tBike\t1.0\t2001.0\n", + "jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1.0\t2000.0\n", + "melanie.white@example2.edu\tMelanie White\tNC\tF\t'iPad'\t1.0\t1999.0\n", + "ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1.0\t1998.0\n", + "arthur.duff@example4.com\tArthur Duff\tOR\tM\tNight table\t1.0\t1997.0\n", + "Project 2423289296267 has been successfully deleted\n" + ] + } + ], + "source": [ + "p = cli.create('data/cli/duplicates.xlsx')\n", + "cli.info(p.project_id)\n", + "cli.export(p.project_id)\n", + "cli.delete(p.project_id)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### sheets" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "first sheet from file with 2 sheets" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id: 1593486586431\n", + "rows: 10\n", + " id: 1593486586431\n", + " url: 
http://127.0.0.1:3333/project?project=1593486586431\n", + " name: duplicates2\n", + " modified: 2019-08-20T02:14:04Z\n", + " created: 2019-08-20T02:14:04Z\n", + " column 001: email\n", + " column 002: name\n", + " column 003: state\n", + " column 004: gender\n", + " column 005: purchase\n", + " column 006: count\n", + " column 007: date\n", + "email\tname\tstate\tgender\tpurchase\tcount\tdate\n", + "danny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: ๐Ÿ“บ)\t1.0\tWed, 4 Jul 2001\n", + "melanie.white@example2.edu\tMelanie White\tNC\tF\t\t1.0\t2001-07-04T12:08:56\n", + "danny.baron@example1.com\t\" D.(\"\"Tab\"\") Baron\"\tCA\tM\tWinter jacket\t1.0\t2001-07-04\n", + "ben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1.0\t2001/07/04\n", + "arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table\t1.0\t2001-07\n", + "danny.baron@example1.com\tDaniel Baron\t\t\tBike\t1.0\t2001.0\n", + "jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1.0\t2000.0\n", + "melanie.white@example2.edu\tMelanie White\tNC\tF\t'iPad'\t1.0\t1999.0\n", + "ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1.0\t1998.0\n", + "arthur.duff@example4.com\tArthur Duff\tOR\tM\tNight table\t1.0\t1997.0\n", + "Project 1593486586431 has been successfully deleted\n" + ] + } + ], + "source": [ + "p = cli.create('data/cli/duplicates2.xlsx', sheets=[0])\n", + "cli.info(p.project_id)\n", + "cli.export(p.project_id)\n", + "cli.delete(p.project_id)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "both sheets from file with 2 sheets: should contain 16 rows" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id: 1857964669991\n", + "rows: 16\n", + " id: 1857964669991\n", + " url: http://127.0.0.1:3333/project?project=1857964669991\n", + " name: duplicates2\n", + " modified: 2019-08-20T02:14:09Z\n", + " created: 
2019-08-20T02:14:09Z\n", + " column 001: email\n", + " column 002: name\n", + " column 003: state\n", + " column 004: gender\n", + " column 005: purchase\n", + " column 006: count\n", + " column 007: date\n", + "email\tname\tstate\tgender\tpurchase\tcount\tdate\n", + "danny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: ๐Ÿ“บ)\t1.0\tWed, 4 Jul 2001\n", + "melanie.white@example2.edu\tMelanie White\tNC\tF\t\t1.0\t2001-07-04T12:08:56\n", + "danny.baron@example1.com\t\" D.(\"\"Tab\"\") Baron\"\tCA\tM\tWinter jacket\t1.0\t2001-07-04\n", + "ben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1.0\t2001/07/04\n", + "arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table\t1.0\t2001-07\n", + "danny.baron@example1.com\tDaniel Baron\t\t\tBike\t1.0\t2001.0\n", + "jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1.0\t2000.0\n", + "melanie.white@example2.edu\tMelanie White\tNC\tF\t'iPad'\t1.0\t1999.0\n", + "ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1.0\t1998.0\n", + "arthur.duff@example4.com\tArthur Duff\tOR\tM\tNight table\t1.0\t1997.0\n", + "danny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: ๐Ÿ“บ), Winter jacket, bike\t3.0\tWed, 4 Jul 2001, 2001-07-04, 2001\n", + "melanie.white@example2.edu\tMelanie White\tNC\tF\t, 'iPad'\t2.0\t2001-07-04T12:08:56, 1999\n", + "ben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1.0\t2001/07/04\n", + "arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table, Night table\t2.0\t2001-07, 1997\n", + "jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1.0\t2000.0\n", + "ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1.0\t1998.0\n", + "Project 1857964669991 has been successfully deleted\n" + ] + } + ], + "source": [ + "p = cli.create('data/cli/duplicates2.xlsx', sheets=[0, 1])\n", + "cli.info(p.project_id)\n", + "cli.export(p.project_id)\n", + "cli.delete(p.project_id)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": 
[] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.16" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tests/data/duplicates-deletion.json b/tests/data/cli/duplicates-deletion.json similarity index 100% rename from tests/data/duplicates-deletion.json rename to tests/data/cli/duplicates-deletion.json diff --git a/tests/data/cli/duplicates.csv b/tests/data/cli/duplicates.csv new file mode 100644 index 0000000..7a79dfe --- /dev/null +++ b/tests/data/cli/duplicates.csv @@ -0,0 +1,11 @@ +email,name,state,gender,purchase,count,date +danny.baron@example1.com,Danny Baron,CA,M,TV (UTF-8: ๐Ÿ“บ),1,"Wed, 4 Jul 2001" +melanie.white@example2.edu,Melanie White,NC,F,,1,2001-07-04T12:08:56 +danny.baron@example1.com, D. 
("Tab") Baron,CA,M,Winter jacket,1,2001-07-04 +ben.tyler@example3.org,Ben Tyler,NV,M,Flashlight,1,2001/07/04 +arthur.duff@example4.com,Arthur Duff,OR,M,Dining table,1,2001-07 +danny.baron@example1.com,Daniel Baron,,,Bike,1,2001 +jean.griffith@example5.org,Jean Griffith,WA,F,Power drill,1,2000 +melanie.white@example2.edu,Melanie White,NC,F,'iPad',1,1999 +ben.morisson@example6.org,Ben Morisson,FL,M,Amplifier,1,1998 +arthur.duff@example4.com,Arthur Duff,OR,M,Night table,1,1997 diff --git a/tests/data/cli/duplicates.json b/tests/data/cli/duplicates.json new file mode 100644 index 0000000..d94ebea --- /dev/null +++ b/tests/data/cli/duplicates.json @@ -0,0 +1,92 @@ +[ + { + "email": "danny.baron@example1.com", + "name": "Danny Baron", + "state": "CA", + "gender": "M", + "purchase": "TV (UTF-8: ๐Ÿ“บ)", + "count": 1, + "date": "Wed, 4 Jul 2001" + }, + { + "email": "melanie.white@example2.edu", + "name": "Melanie White", + "state": "NC", + "gender": "F", + "purchase": "", + "count": 1, + "date": "2001-07-04T12:08:56" + }, + { + "email": "danny.baron@example1.com", + "name": " D.\t(\"Tab\") Baron", + "state": "CA", + "gender": "M", + "purchase": "Winter jacket", + "count": 1, + "date": "2001-07-04" + }, + { + "email": "ben.tyler@example3.org", + "name": "Ben Tyler", + "state": "NV", + "gender": "M", + "purchase": "Flashlight", + "count": 1, + "date": "2001/07/04" + }, + { + "email": "arthur.duff@example4.com", + "name": "Arthur Duff", + "state": "OR", + "gender": "M", + "purchase": "Dining table", + "count": 1, + "date": "2001-07" + }, + { + "email": "danny.baron@example1.com", + "name": "Daniel Baron", + "state": "", + "gender": "", + "purchase": "Bike", + "count": 1, + "date": 2001 + }, + { + "email": "jean.griffith@example5.org", + "name": "Jean Griffith", + "state": "WA", + "gender": "F", + "purchase": "Power drill", + "count": 1, + "date": 2000 + }, + { + "email": "melanie.white@example2.edu", + "name": "Melanie White", + "state": "NC", + "gender": "F", + "purchase": 
"'iPad'", + "count": 1, + "date": 1999 + }, + { + "email": "ben.morisson@example6.org", + "name": "Ben Morisson", + "state": "FL", + "gender": "M", + "purchase": "Amplifier", + "count": 1, + "date": 1998 + }, + { + "email": "arthur.duff@example4.com", + "name": "Arthur Duff", + "state": "OR", + "gender": "M", + "purchase": "Night table", + "count": 1, + "date": 1997 + } +] diff --git a/tests/data/cli/duplicates.ods b/tests/data/cli/duplicates.ods new file mode 100644 index 0000000..375a0af Binary files /dev/null and b/tests/data/cli/duplicates.ods differ diff --git a/tests/data/cli/duplicates.tsv b/tests/data/cli/duplicates.tsv new file mode 100644 index 0000000..5a9767d --- /dev/null +++ b/tests/data/cli/duplicates.tsv @@ -0,0 +1,11 @@ +email name state gender purchase count date +danny.baron@example1.com Danny Baron CA M TV (UTF-8: ๐Ÿ“บ) 1 Wed, 4 Jul 2001 +melanie.white@example2.edu Melanie White NC F 1 2001-07-04T12:08:56 +danny.baron@example1.com "D. (""Tab"") Baron" CA M Winter jacket 1 2001-07-04 +ben.tyler@example3.org Ben Tyler NV M Flashlight 1 2001/07/04 +arthur.duff@example4.com Arthur Duff OR M Dining table 1 2001-07 +danny.baron@example1.com Daniel Baron Bike 1 2001 +jean.griffith@example5.org Jean Griffith WA F Power drill 1 2000 +melanie.white@example2.edu Melanie White NC F 'iPad' 1 1999 +ben.morisson@example6.org Ben Morisson FL M Amplifier 1 1998 +arthur.duff@example4.com Arthur Duff OR M Night table 1 1997 diff --git a/tests/data/cli/duplicates.txt b/tests/data/cli/duplicates.txt new file mode 100644 index 0000000..3d76a12 --- /dev/null +++ b/tests/data/cli/duplicates.txt @@ -0,0 +1,11 @@ +email name state gender purchase count date +danny.baron@example1.com Danny Baron CA M TV (UTF-8: ๐Ÿ“บ) 1 Wed, 4 Jul 2001 +melanie.white@example2.edu Melanie White NC F 1 2001-07-04T12:08:5 +danny.baron@example1.com D. 
("Tab") Baron CA M Winter jacket 1 2001-07-04 +ben.tyler@example3.org Ben Tyler NV M Flashlight 1 2001/07/04 +arthur.duff@example4.com Arthur Duff OR M Dining table 1 2001-07 +danny.baron@example1.com Daniel Baron Bike 1 2001 +jean.griffith@example5.org Jean Griffith WA F Power drill 1 2000 +melanie.white@example2.edu Melanie White NC F 'iPad' 1 1999 +ben.morisson@example6.org Ben Morisson FL M Amplifier 1 1998 +arthur.duff@example4.com Arthur Duff OR M Night table 1 1997 diff --git a/tests/data/cli/duplicates.xls b/tests/data/cli/duplicates.xls new file mode 100644 index 0000000..cbc2916 Binary files /dev/null and b/tests/data/cli/duplicates.xls differ diff --git a/tests/data/cli/duplicates.xlsx b/tests/data/cli/duplicates.xlsx new file mode 100644 index 0000000..30418c5 Binary files /dev/null and b/tests/data/cli/duplicates.xlsx differ diff --git a/tests/data/cli/duplicates.xml b/tests/data/cli/duplicates.xml new file mode 100644 index 0000000..2402358 --- /dev/null +++ b/tests/data/cli/duplicates.xml @@ -0,0 +1,93 @@ + + + + danny.baron@example1.com + Danny Baron + CA + M + TV (UTF-8: ๐Ÿ“บ) + 1 + Wed, 4 Jul 2001 + + + melanie.white@example2.edu + Melanie White + NC + F + <iPhone> + 1 + 2001-07-04T12:08:56 + + + danny.baron@example1.com + D. 
("Tab") Baron + CA + M + Winter jacket + 1 + 2001-07-04 + + + ben.tyler@example3.org + Ben Tyler + NV + M + Flashlight + 1 + 2001/07/04 + + + arthur.duff@example4.com + Arthur Duff + OR + M + Dining table + 1 + 2001-07 + + + danny.baron@example1.com + Daniel Baron + + + Bike + 1 + 2001 + + + jean.griffith@example5.org + Jean Griffith + WA + F + Power drill + 1 + 2000 + + + melanie.white@example2.edu + Melanie White + NC + F + 'iPad' + 1 + 1999 + + + ben.morisson@example6.org + Ben Morisson + FL + M + Amplifier + 1 + 1998 + + + arthur.duff@example4.com + Arthur Duff + OR + M + Night table + 1 + 1997 + + diff --git a/tests/data/cli/duplicates.zip b/tests/data/cli/duplicates.zip new file mode 100644 index 0000000..eee03ff Binary files /dev/null and b/tests/data/cli/duplicates.zip differ diff --git a/tests/data/cli/duplicates1.xml b/tests/data/cli/duplicates1.xml new file mode 100644 index 0000000..ad678e2 --- /dev/null +++ b/tests/data/cli/duplicates1.xml @@ -0,0 +1,10 @@ + + + danny.baron@example1.com + Danny Baron + CA + M + TV (UTF-8: ๐Ÿ“บ) + 1 + Wed, 4 Jul 2001 + diff --git a/tests/data/cli/duplicates2.ods b/tests/data/cli/duplicates2.ods new file mode 100644 index 0000000..25751a8 Binary files /dev/null and b/tests/data/cli/duplicates2.ods differ diff --git a/tests/data/cli/duplicates2.xls b/tests/data/cli/duplicates2.xls new file mode 100644 index 0000000..5774b29 Binary files /dev/null and b/tests/data/cli/duplicates2.xls differ diff --git a/tests/data/cli/duplicates2.xlsx b/tests/data/cli/duplicates2.xlsx new file mode 100644 index 0000000..5cc2413 Binary files /dev/null and b/tests/data/cli/duplicates2.xlsx differ