realigned create/new_project to upstream

new feature: xml root element will be discovered if recordPath is not set
bugfix: newly introduced option projectTags was not working in 0.3.7
bugfix: txt defaulted to fixed-width (should be line-based)
bugfix: default recordPath for json was not working in 0.3.7
bugfix: default sheets option was broken (but xls, xlsx and ods are broken in OpenRefine >=2.8 anyway, see #4)
tests: added sample files and an ipython notebook for comprehensive tests of create option
This commit is contained in:
Felix Lohmeier 2019-08-20 04:30:50 +02:00
parent 7ad79af3ca
commit 375ac42be0
18 changed files with 2749 additions and 38 deletions

View File

@ -145,7 +145,7 @@ group5.add_option('--projectTags', dest='projectTags',
help='(all formats), please provide tags in multiple arguments, e.g. --projectTags=beta --projectTags=client1') help='(all formats), please provide tags in multiple arguments, e.g. --projectTags=beta --projectTags=client1')
group5.add_option('--recordPath', dest='recordPath', group5.add_option('--recordPath', dest='recordPath',
action='append', action='append',
help='(xml,json), please provide path in multiple arguments without slashes, e.g. /collection/record/ should be entered like this: --recordPath=collection --recordPath=record, default xml: record, default json: _ _') help='(xml,json), please provide path in multiple arguments, e.g. /collection/record/ should be entered: --recordPath=collection --recordPath=record, default xml: root element, default json: _ _')
group5.add_option('--separator', dest='separator', group5.add_option('--separator', dest='separator',
help='(csv,tsv), default csv: , default tsv: \\t') help='(csv,tsv), default csv: , default tsv: \\t')
group5.add_option('--sheets', dest='sheets', group5.add_option('--sheets', dest='sheets',

View File

@ -25,6 +25,7 @@ import ssl
import sys import sys
import time import time
import urllib import urllib
from xml.etree import ElementTree
from google.refine import refine from google.refine import refine
@ -43,7 +44,6 @@ def apply(project_id, history_file):
def create(project_file, def create(project_file,
project_format=None, project_format=None,
project_name=None,
columnWidths=None, columnWidths=None,
encoding=None, encoding=None,
guessCellValueTypes=False, guessCellValueTypes=False,
@ -54,6 +54,7 @@ def create(project_file,
linesPerRow=None, linesPerRow=None,
processQuotes=True, processQuotes=True,
projectName=None, projectName=None,
projectTags=None,
recordPath=None, recordPath=None,
separator=None, separator=None,
sheets=None, sheets=None,
@ -69,15 +70,15 @@ def create(project_file,
project_format = os.path.splitext(project_file)[1][1:].lower() project_format = os.path.splitext(project_file)[1][1:].lower()
if project_format == 'txt': if project_format == 'txt':
try: try:
columnWidths columnWidths[0]
project_format = 'fixed-width' project_format = 'fixed-width'
except NameError: except TypeError:
project_format = 'line-based' project_format = 'line-based'
# defaults for each file type # defaults for each file type
if project_format == 'xml': if project_format == 'xml':
project_format = 'text/xml' project_format = 'text/xml'
if not recordPath: if not recordPath:
recordPath = 'record' recordPath = [ElementTree.parse(project_file).getroot().tag]
elif project_format == 'csv': elif project_format == 'csv':
project_format = 'text/line-based/*sv' project_format = 'text/line-based/*sv'
elif project_format == 'tsv': elif project_format == 'tsv':
@ -95,22 +96,35 @@ def create(project_file,
elif project_format == 'json': elif project_format == 'json':
project_format = 'text/json' project_format = 'text/json'
if not recordPath: if not recordPath:
recordPath = ('_', '_') recordPath = ['_', '_']
elif project_format == 'xls': elif project_format == 'xls':
project_format = 'binary/text/xml/xls/xlsx' project_format = 'binary/text/xml/xls/xlsx'
if not sheets: if not sheets:
sheets = 0 sheets = [0]
# TODO: new format for sheets option introduced in OpenRefine 2.8
elif project_format == 'xlsx': elif project_format == 'xlsx':
project_format = 'binary/text/xml/xls/xlsx' project_format = 'binary/text/xml/xls/xlsx'
if not sheets: if not sheets:
sheets = 0 sheets = [0]
# TODO: new format for sheets option introduced in OpenRefine 2.8
elif project_format == 'ods': elif project_format == 'ods':
project_format = 'text/xml/ods' project_format = 'text/xml/ods'
if not sheets: if not sheets:
sheets = 0 sheets = [0]
# TODO: new format for sheets option introduced in OpenRefine 2.8
# execute # execute
kwargs = {k: v for k, v in vars().items() if v is not None} kwargs = {k: v for k, v in vars().items() if v is not None}
project = refine.Refine(refine.RefineServer()).new_project(**kwargs) project = refine.Refine(refine.RefineServer()).new_project(
guess_cell_value_types=guessCellValueTypes,
ignore_lines=ignoreLines,
header_lines=headerLines,
skip_data_lines=skipDataLines,
store_blank_rows=storeBlankRows,
process_quotes=processQuotes,
project_name=projectName,
store_blank_cells_as_nulls=storeBlankCellsAsNulls,
include_file_sources=includeFileSources,
**kwargs)
rows = project.do_json('get-rows')['total'] rows = project.do_json('get-rows')['total']
if rows > 0: if rows > 0:
print('{0}: {1}'.format('id', project.project_id)) print('{0}: {1}'.format('id', project.project_id))

View File

@ -147,41 +147,127 @@ class Refine:
"""Open a Refine project.""" """Open a Refine project."""
return RefineProject(self.server, project_id) return RefineProject(self.server, project_id)
def new_project(self, project_file=None, project_name=None, # These aren't used yet but are included for reference
project_format='text/line-based/*sv', **kwargs): new_project_defaults = {
"""Create a Refine project.""" 'text/line-based/*sv': {
'encoding': '',
defaults = {'guessCellValueTypes': False,
'headerLines': 1,
'ignoreLines': -1,
'includeFileSources': False,
'limit': -1,
'linesPerRow': 1,
'processQuotes': True,
'separator': ',', 'separator': ',',
'skipDataLines': 0, 'ignore_lines': -1,
'storeBlankCellsAsNulls': True, 'header_lines': 1,
'storeBlankRows': True, 'skip_data_lines': 0,
'storeEmptyStrings': True, 'limit': -1,
'trimStrings': False} 'store_blank_rows': True,
'guess_cell_value_types': True,
'process_quotes': True,
'store_blank_cells_as_nulls': True,
'include_file_sources': False},
'text/line-based': {
'encoding': '',
'lines_per_row': 1,
'ignore_lines': -1,
'limit': -1,
'skip_data_lines': -1,
'store_blank_rows': True,
'store_blank_cells_as_nulls': True,
'include_file_sources': False},
'text/line-based/fixed-width': {
'encoding': '',
'column_widths': [20],
'ignore_lines': -1,
'header_lines': 0,
'skip_data_lines': 0,
'limit': -1,
'guess_cell_value_types': False,
'store_blank_rows': True,
'store_blank_cells_as_nulls': True,
'include_file_sources': False},
'text/line-based/pc-axis': {
'encoding': '',
'limit': -1,
'skip_data_lines': -1,
'include_file_sources': False},
'text/rdf+n3': {'encoding': ''},
'text/xml/ods': {
'sheets': [],
'ignore_lines': -1,
'header_lines': 1,
'skip_data_lines': 0,
'limit': -1,
'store_blank_rows': True,
'store_blank_cells_as_nulls': True,
'include_file_sources': False},
'binary/xls': {
'xml_based': False,
'sheets': [],
'ignore_lines': -1,
'header_lines': 1,
'skip_data_lines': 0,
'limit': -1,
'store_blank_rows': True,
'store_blank_cells_as_nulls': True,
'include_file_sources': False}
}
# options def new_project(self, project_file=None, project_url=None, project_name=None, project_format='text/line-based/*sv',
options = {'format': project_format} encoding='',
if project_file is not None: separator=',',
options['project-file'] = {'fd': open(project_file), ignore_lines=-1,
'filename': project_file} header_lines=1,
skip_data_lines=0,
limit=-1,
store_blank_rows=True,
guess_cell_value_types=False,
process_quotes=True,
store_blank_cells_as_nulls=True,
include_file_sources=False,
**opts):
if (project_file and project_url) or (not project_file and not project_url):
raise ValueError('One (only) of project_file and project_url must be set')
def s(opt):
if isinstance(opt, bool):
return 'true' if opt else 'false'
if opt is None:
return ''
return str(opt)
# the new API requires a JSON object in the 'options' POST or GET argument
# POST is broken at the moment, so we send it in the URL
new_style_options = dict(opts, **{
'encoding': s(encoding),
})
params = {
'options': json.dumps(new_style_options),
}
# old style options
options = {
'format': project_format,
'separator': s(separator),
'ignore-lines': s(ignore_lines),
'header-lines': s(header_lines),
'skip-data-lines': s(skip_data_lines),
'limit': s(limit),
'guess-value-type': s(guess_cell_value_types),
'process-quotes': s(process_quotes),
'store-blank-rows': s(store_blank_rows),
'store-blank-cells-as-nulls': s(store_blank_cells_as_nulls),
'include-file-sources': s(include_file_sources),
}
if project_url is not None:
options['url'] = project_url
elif project_file is not None:
options['project-file'] = {
'fd': open(project_file),
'filename': project_file,
}
if project_name is None: if project_name is None:
# make a name for itself by stripping extension and directories # make a name for itself by stripping extension and directories
project_name = (project_file or 'New project').rsplit('.', 1)[0] project_name = (project_file or 'New project').rsplit('.', 1)[0]
project_name = os.path.basename(project_name) project_name = os.path.basename(project_name)
options['project-name'] = project_name options['project-name'] = project_name
# params
params_dict = dict(defaults)
params_dict.update(kwargs)
params = {'options': json.dumps(params_dict)}
# submit
response = self.server.urlopen( response = self.server.urlopen(
'create-project-from-upload', options, params 'create-project-from-upload', options, params
) )

2383
tests/cli_create.ipynb Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,11 @@
email,name,state,gender,purchase,count,date
danny.baron@example1.com,Danny Baron,CA,M,TV (UTF-8: 📺),1,"Wed, 4 Jul 2001"
melanie.white@example2.edu,Melanie White,NC,F,<iPhone>,1,2001-07-04T12:08:56
danny.baron@example1.com, D. ("Tab") Baron,CA,M,Winter jacket,1,2001-07-04
ben.tyler@example3.org,Ben Tyler,NV,M,Flashlight,1,2001/07/04
arthur.duff@example4.com,Arthur Duff,OR,M,Dining table,1,2001-07
danny.baron@example1.com,Daniel Baron,,,Bike,1,2001
jean.griffith@example5.org,Jean Griffith,WA,F,Power drill,1,2000
melanie.white@example2.edu,Melanie White,NC,F,'iPad',1,1999
ben.morisson@example6.org,Ben Morisson,FL,M,Amplifier,1,1998
arthur.duff@example4.com,Arthur Duff,OR,M,Night table,1,1997
Can't render this file because it contains an unexpected character in line 4 and column 33.

View File

@ -0,0 +1,92 @@
[
{
"email": "danny.baron@example1.com",
"name": "Danny Baron",
"state": "CA",
"gender": "M",
"purchase": "TV (UTF-8: 📺)",
"count": 1,
"date": "Wed, 4 Jul 2001"
},
{
"email": "melanie.white@example2.edu",
"name": "Melanie White",
"state": "NC",
"gender": "F",
"purchase": "<iPhone>",
"count": 1,
"date": "2001-07-04T12:08:56"
},
{
"email": "danny.baron@example1.com",
"name": " D.\t(\"Tab\") Baron",
"state": "CA",
"gender": "M",
"purchase": "Winter jacket",
"count": 1,
"date": "2001-07-04"
},
{
"email": "ben.tyler@example3.org",
"name": "Ben Tyler",
"state": "NV",
"gender": "M",
"purchase": "Flashlight",
"count": 1,
"date": "2001/07/04"
},
{
"email": "arthur.duff@example4.com",
"name": "Arthur Duff",
"state": "OR",
"gender": "M",
"purchase": "Dining table",
"count": 1,
"date": "2001-07"
},
{
"email": "danny.baron@example1.com",
"name": "Daniel Baron",
"state": "",
"gender": "",
"purchase": "Bike",
"count": 1,
"date": 2001
},
{
"email": "jean.griffith@example5.org",
"name": "Jean Griffith",
"state": "WA",
"gender": "F",
"purchase": "Power drill",
"count": 1,
"date": 2000
},
{
"email": "melanie.white@example2.edu",
"name": "Melanie White",
"state": "NC",
"gender": "F",
"purchase": "'iPad'",
"count": 1,
"date": 1999
},
{
"email": "ben.morisson@example6.org",
"name": "Ben Morisson",
"state": "FL",
"gender": "M",
"purchase": "Amplifier",
"count": 1,
"date": 1998
},
{
"email": "arthur.duff@example4.com",
"name": "Arthur Duff",
"state": "OR",
"gender": "M",
"purchase": "Night table",
"count": 1,
"date": 1997
}
]

Binary file not shown.

View File

@ -0,0 +1,11 @@
email name state gender purchase count date
danny.baron@example1.com Danny Baron CA M TV (UTF-8: 📺) 1 Wed, 4 Jul 2001
melanie.white@example2.edu Melanie White NC F <iPhone> 1 2001-07-04T12:08:56
danny.baron@example1.com "D. (""Tab"") Baron" CA M Winter jacket 1 2001-07-04
ben.tyler@example3.org Ben Tyler NV M Flashlight 1 2001/07/04
arthur.duff@example4.com Arthur Duff OR M Dining table 1 2001-07
danny.baron@example1.com Daniel Baron Bike 1 2001
jean.griffith@example5.org Jean Griffith WA F Power drill 1 2000
melanie.white@example2.edu Melanie White NC F 'iPad' 1 1999
ben.morisson@example6.org Ben Morisson FL M Amplifier 1 1998
arthur.duff@example4.com Arthur Duff OR M Night table 1 1997
1 email name state gender purchase count date
2 danny.baron@example1.com Danny Baron CA M TV (UTF-8: 📺) 1 Wed, 4 Jul 2001
3 melanie.white@example2.edu Melanie White NC F <iPhone> 1 2001-07-04T12:08:56
4 danny.baron@example1.com D. ("Tab") Baron CA M Winter jacket 1 2001-07-04
5 ben.tyler@example3.org Ben Tyler NV M Flashlight 1 2001/07/04
6 arthur.duff@example4.com Arthur Duff OR M Dining table 1 2001-07
7 danny.baron@example1.com Daniel Baron Bike 1 2001
8 jean.griffith@example5.org Jean Griffith WA F Power drill 1 2000
9 melanie.white@example2.edu Melanie White NC F 'iPad' 1 1999
10 ben.morisson@example6.org Ben Morisson FL M Amplifier 1 1998
11 arthur.duff@example4.com Arthur Duff OR M Night table 1 1997

View File

@ -0,0 +1,11 @@
email name state gender purchase count date
danny.baron@example1.com Danny Baron CA M TV (UTF-8: 📺) 1 Wed, 4 Jul 2001
melanie.white@example2.edu Melanie White NC F <iPhone> 1 2001-07-04T12:08:5
danny.baron@example1.com D. ("Tab") Baron CA M Winter jacket 1 2001-07-04
ben.tyler@example3.org Ben Tyler NV M Flashlight 1 2001/07/04
arthur.duff@example4.com Arthur Duff OR M Dining table 1 2001-07
danny.baron@example1.com Daniel Baron Bike 1 2001
jean.griffith@example5.org Jean Griffith WA F Power drill 1 2000
melanie.white@example2.edu Melanie White NC F 'iPad' 1 1999
ben.morisson@example6.org Ben Morisson FL M Amplifier 1 1998
arthur.duff@example4.com Arthur Duff OR M Night table 1 1997

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1,93 @@
<?xml version="1.0" encoding="UTF-8"?>
<root>
<record>
<email>danny.baron@example1.com</email>
<name>Danny Baron</name>
<state>CA</state>
<gender>M</gender>
<purchase>TV (UTF-8: 📺)</purchase>
<count>1</count>
<date>Wed, 4 Jul 2001</date>
</record>
<record>
<email>melanie.white@example2.edu</email>
<name>Melanie White</name>
<state>NC</state>
<gender>F</gender>
<purchase>&lt;iPhone&gt;</purchase>
<count>1</count>
<date>2001-07-04T12:08:56</date>
</record>
<record>
<email>danny.baron@example1.com</email>
<name> D. (&quot;Tab&quot;) Baron</name>
<state>CA</state>
<gender>M</gender>
<purchase>Winter jacket</purchase>
<count>1</count>
<date>2001-07-04</date>
</record>
<record>
<email>ben.tyler@example3.org</email>
<name>Ben Tyler</name>
<state>NV</state>
<gender>M</gender>
<purchase>Flashlight</purchase>
<count>1</count>
<date>2001/07/04</date>
</record>
<record>
<email>arthur.duff@example4.com</email>
<name>Arthur Duff</name>
<state>OR</state>
<gender>M</gender>
<purchase>Dining table</purchase>
<count>1</count>
<date>2001-07</date>
</record>
<record>
<email>danny.baron@example1.com</email>
<name>Daniel Baron</name>
<state></state>
<gender></gender>
<purchase>Bike</purchase>
<count>1</count>
<date>2001</date>
</record>
<record>
<email>jean.griffith@example5.org</email>
<name>Jean Griffith</name>
<state>WA</state>
<gender>F</gender>
<purchase>Power drill</purchase>
<count>1</count>
<date>2000</date>
</record>
<record>
<email>melanie.white@example2.edu</email>
<name>Melanie White</name>
<state>NC</state>
<gender>F</gender>
<purchase>&apos;iPad&apos;</purchase>
<count>1</count>
<date>1999</date>
</record>
<record>
<email>ben.morisson@example6.org</email>
<name>Ben Morisson</name>
<state>FL</state>
<gender>M</gender>
<purchase>Amplifier</purchase>
<count>1</count>
<date>1998</date>
</record>
<record>
<email>arthur.duff@example4.com</email>
<name>Arthur Duff</name>
<state>OR</state>
<gender>M</gender>
<purchase>Night table</purchase>
<count>1</count>
<date>1997</date>
</record>
</root>

Binary file not shown.

View File

@ -0,0 +1,10 @@
<?xml version="1.0" encoding="UTF-8"?>
<record>
<email>danny.baron@example1.com</email>
<name>Danny Baron</name>
<state>CA</state>
<gender>M</gender>
<purchase>TV (UTF-8: 📺)</purchase>
<count>1</count>
<date>Wed, 4 Jul 2001</date>
</record>

Binary file not shown.

Binary file not shown.

Binary file not shown.