realign to upstream

This commit is contained in:
Felix Lohmeier 2019-08-14 13:45:35 +02:00
parent ad95432fc0
commit aa5b3a4203
4 changed files with 124 additions and 42 deletions

View File

@ -1,6 +1,6 @@
#!/usr/bin/env python #!/usr/bin/env python
""" """
Client library to communicate with a OpenRefine server. Client library to communicate with a Refine server.
""" """
# Copyright (c) 2011 Paul Makepeace, Real Programmers. All rights reserved. # Copyright (c) 2011 Paul Makepeace, Real Programmers. All rights reserved.
@ -38,11 +38,11 @@ REFINE_PORT = os.environ.get('OPENREFINE_PORT', os.environ.get('GOOGLE_REFINE_PO
class RefineServer(object): class RefineServer(object):
"""Communicate with a OpenRefine server.""" """Communicate with a Refine server."""
@staticmethod @staticmethod
def url(): def url():
"""Return the URL to the OpenRefine server.""" """Return the URL to the Refine server."""
server = 'http://' + REFINE_HOST server = 'http://' + REFINE_HOST
if REFINE_PORT != '80': if REFINE_PORT != '80':
server += ':' + REFINE_PORT server += ':' + REFINE_PORT
@ -55,7 +55,7 @@ class RefineServer(object):
self.__version = None # see version @property below self.__version = None # see version @property below
def urlopen(self, command, data=None, params=None, project_id=None): def urlopen(self, command, data=None, params=None, project_id=None):
"""Open a OpenRefine URL and with optional query params and POST data. """Open a Refine URL and with optional query params and POST data.
data: POST data dict data: POST data dict
params: query params dict params: query params dict
@ -85,7 +85,7 @@ class RefineServer(object):
raise Exception('HTTP %d "%s" for %s\n\t%s' % (e.code, e.msg, e.geturl(), data)) raise Exception('HTTP %d "%s" for %s\n\t%s' % (e.code, e.msg, e.geturl(), data))
except urllib2.URLError as e: except urllib2.URLError as e:
raise urllib2.URLError( raise urllib2.URLError(
'%s for %s. No OpenRefine server reachable/running; ENV set?' % '%s for %s. No Refine server reachable/running; ENV set?' %
(e.reason, self.server)) (e.reason, self.server))
if response.info().get('Content-Encoding', None) == 'gzip': if response.info().get('Content-Encoding', None) == 'gzip':
# Need a seekable filestream for gzip # Need a seekable filestream for gzip
@ -95,7 +95,7 @@ class RefineServer(object):
return response return response
def urlopen_json(self, *args, **kwargs): def urlopen_json(self, *args, **kwargs):
"""Open a OpenRefine URL, optionally POST data, and return parsed JSON.""" """Open a Refine URL, optionally POST data, and return parsed JSON."""
response = json.loads(self.urlopen(*args, **kwargs).read()) response = json.loads(self.urlopen(*args, **kwargs).read())
if 'code' in response and response['code'] not in ('ok', 'pending'): if 'code' in response and response['code'] not in ('ok', 'pending'):
error_message = ('server ' + response['code'] + ': ' + error_message = ('server ' + response['code'] + ': ' +
@ -118,7 +118,7 @@ class RefineServer(object):
class Refine: class Refine:
"""Class representing a connection to a OpenRefine server.""" """Class representing a connection to a Refine server."""
def __init__(self, server): def __init__(self, server):
if isinstance(server, RefineServer): if isinstance(server, RefineServer):
self.server = server self.server = server
@ -144,17 +144,99 @@ class Refine:
return projects[project_id]['name'] return projects[project_id]['name']
def open_project(self, project_id): def open_project(self, project_id):
"""Open a OpenRefine project.""" """Open a Refine project."""
return RefineProject(self.server, project_id) return RefineProject(self.server, project_id)
def new_project(self, project_file=None, project_name=None, # These aren't used yet but are included for reference
project_format='', **kwargs): new_project_defaults = {
"""Create a OpenRefine project.""" 'text/line-based/*sv': {
defaults = { 'guessCellValueTypes' : False, 'headerLines' : 1, 'ignoreLines' : -1, 'includeFileSources' : False, 'limit' : -1, 'linesPerRow' : 1, 'processQuotes' : True, 'separator' : ',', 'skipDataLines' : 0, 'storeBlankCellsAsNulls' : True, 'storeBlankRows' : True, 'storeEmptyStrings' : True, 'trimStrings' : False } 'encoding': '',
'separator': ',',
'ignore_lines': -1,
'header_lines': 1,
'skip_data_lines': 0,
'limit': -1,
'store_blank_rows': True,
'guess_cell_value_types': True,
'process_quotes': True,
'store_blank_cells_as_nulls': True,
'include_file_sources': False},
'text/line-based': {
'encoding': '',
'lines_per_row': 1,
'ignore_lines': -1,
'limit': -1,
'skip_data_lines': -1,
'store_blank_rows': True,
'store_blank_cells_as_nulls': True,
'include_file_sources': False},
'text/line-based/fixed-width': {
'encoding': '',
'column_widths': [20],
'ignore_lines': -1,
'header_lines': 0,
'skip_data_lines': 0,
'limit': -1,
'guess_cell_value_types': False,
'store_blank_rows': True,
'store_blank_cells_as_nulls': True,
'include_file_sources': False},
'text/line-based/pc-axis': {
'encoding': '',
'limit': -1,
'skip_data_lines': -1,
'include_file_sources': False},
'text/rdf+n3': {'encoding': ''},
'text/xml/ods': {
'sheets': [],
'ignore_lines': -1,
'header_lines': 1,
'skip_data_lines': 0,
'limit': -1,
'store_blank_rows': True,
'store_blank_cells_as_nulls': True,
'include_file_sources': False},
'binary/xls': {
'xml_based': False,
'sheets': [],
'ignore_lines': -1,
'header_lines': 1,
'skip_data_lines': 0,
'limit': -1,
'store_blank_rows': True,
'store_blank_cells_as_nulls': True,
'include_file_sources': False}
}
def new_project(self, project_file=None, project_url=None, project_name=None,
project_format=None, **kwargs):
"""Create a Refine project."""
if (project_file and project_url) or (not project_file and not project_url):
raise ValueError('One (only) of project_file and project_url must be set')
defaults = {'guessCellValueTypes': False,
'headerLines': 1,
'ignoreLines': -1,
'includeFileSources': False,
'limit': -1,
'linesPerRow': 1,
'processQuotes': True,
'project_format': 'text/line-based/*sv',
'separator': ',',
'skipDataLines': 0,
'storeBlankCellsAsNulls': True,
'storeBlankRows': True,
'storeEmptyStrings': True,
'trimStrings': False}
# options # options
options = { 'format': project_format } options = {
if project_file is not None: 'format': project_format
}
if project_url is not None:
options['url'] = project_url
elif project_file is not None:
options['project-file'] = { options['project-file'] = {
'fd': open(project_file), 'fd': open(project_file),
'filename': project_file, 'filename': project_file,
@ -165,10 +247,10 @@ class Refine:
project_name = os.path.basename(project_name) project_name = os.path.basename(project_name)
options['project-name'] = project_name options['project-name'] = project_name
# params (the API requires a json in the 'option' POST argument) # params (the API requires a json in the 'options' POST argument)
params = defaults params_dict = dict(defaults)
params.update(kwargs) params_dict.update(kwargs)
params = { 'options': json.dumps(params) } params = { 'options': json.dumps(params_dict) }
# submit # submit
response = self.server.urlopen( response = self.server.urlopen(
@ -179,17 +261,11 @@ class Refine:
urlparse.urlparse(response.geturl()).query) urlparse.urlparse(response.geturl()).query)
if 'project' in url_params: if 'project' in url_params:
project_id = url_params['project'][0] project_id = url_params['project'][0]
# check number of rows
rows = RefineProject(RefineServer(),project_id).do_json('get-rows')['total']
if rows > 0:
print('{0}: {1}'.format('id', project_id))
print('{0}: {1}'.format('rows', rows))
return RefineProject(self.server, project_id) return RefineProject(self.server, project_id)
else:
raise Exception('Project contains 0 rows. Please check --help for mandatory arguments for xml, json, xlsx and ods')
else: else:
raise Exception('Project not created') raise Exception('Project not created')
def RowsResponseFactory(column_index): def RowsResponseFactory(column_index):
"""Factory for parsing the output from get_rows(). """Factory for parsing the output from get_rows().
@ -251,7 +327,7 @@ class RefineProject:
server = RefineServer(server) server = RefineServer(server)
self.server = server self.server = server
if not project_id: if not project_id:
raise Exception('Missing OpenRefine project ID') raise Exception('Missing Refine project ID')
self.project_id = project_id self.project_id = project_id
self.engine = facet.Engine() self.engine = facet.Engine()
self.sorting = facet.Sorting() self.sorting = facet.Sorting()
@ -342,11 +418,17 @@ class RefineProject:
export_format) export_format)
return self.do_raw(url, data={'format': export_format}) return self.do_raw(url, data={'format': export_format})
def export_templating(self, export_format='txt', engine='', prefix='', template='', rowSeparator='', suffix=''): def export_templating(self, export_format='txt', engine='', prefix='',
"""Return a fileobject of a project's data.""" template='', rowSeparator='', suffix=''):
"""Return a fileobject of a project's data in templating mode."""
url = ('export-rows/' + urllib.quote(self.project_name()) + '.' + url = ('export-rows/' + urllib.quote(self.project_name()) + '.' +
export_format) export_format)
return self.do_raw(url, data={'format': 'template', 'template': template, 'engine': engine, 'prefix': prefix, 'suffix': suffix, 'separator': rowSeparator } ) return self.do_raw(url, data={'format': 'template',
'template': template,
'engine': engine,
'prefix': prefix,
'suffix': suffix,
'separator': rowSeparator})
def export_rows(self, **kwargs): def export_rows(self, **kwargs):
"""Return an iterable of parsed rows of a project's data.""" """Return an iterable of parsed rows of a project's data."""

View File

@ -1,6 +1,6 @@
#! /usr/bin/env python #!/usr/bin/env python
""" """
Script to provide a command line interface to a OpenRefine server. Script to provide a command line interface to a Refine server.
""" """
# Copyright (c) 2011 Paul Makepeace, Real Programmers. All rights reserved. # Copyright (c) 2011 Paul Makepeace, Real Programmers. All rights reserved.

View File

@ -37,7 +37,7 @@ class RefineServerTest(refinetest.RefineTestCase):
self.assertTrue(item in version_info) self.assertTrue(item in version_info)
def test_version(self): def test_version(self):
self.assertTrue(self.server.version in ('2.0', '2.1', '2.5', '2.6', '2.7', '2.8')) self.assertTrue(self.server.version in ('2.0', '2.1', '2.5'))
class RefineTest(refinetest.RefineTestCase): class RefineTest(refinetest.RefineTestCase):

View File

@ -22,7 +22,7 @@ from tests import refinetest
class TutorialTestFacets(refinetest.RefineTestCase): class TutorialTestFacets(refinetest.RefineTestCase):
project_file = 'louisiana-elected-officials.csv' project_file = 'louisiana-elected-officials.csv'
project_options = {'guessCellValueTypes': True} project_options = {'guess_cell_value_types': True}
def test_get_rows(self): def test_get_rows(self):
# Section "2. Exploration using Facets": {3} # Section "2. Exploration using Facets": {3}
@ -130,7 +130,7 @@ class TutorialTestFacets(refinetest.RefineTestCase):
class TutorialTestEditing(refinetest.RefineTestCase): class TutorialTestEditing(refinetest.RefineTestCase):
project_file = 'louisiana-elected-officials.csv' project_file = 'louisiana-elected-officials.csv'
project_options = {'guessCellValueTypes': True} project_options = {'guess_cell_value_types': True}
def test_editing(self): def test_editing(self):
# Section "3. Cell Editing": {1} # Section "3. Cell Editing": {1}
@ -138,7 +138,7 @@ class TutorialTestEditing(refinetest.RefineTestCase):
# {2} # {2}
self.project.text_transform(column='Zip Code 2', self.project.text_transform(column='Zip Code 2',
expression='value.toString()[0, 5]') expression='value.toString()[0, 5]')
self.assertInResponse('transform on 6958 cells in column Zip Code 2') self.assertInResponse('transform on 6067 cells in column Zip Code 2')
# {3} - XXX history # {3} - XXX history
# {4} # {4}
office_title_facet = facet.TextFacet('Office Title') office_title_facet = facet.TextFacet('Office Title')
@ -162,8 +162,8 @@ class TutorialTestEditing(refinetest.RefineTestCase):
self.assertEqual(len(clusters), 7) self.assertEqual(len(clusters), 7)
first_cluster = clusters[0] first_cluster = clusters[0]
self.assertEqual(len(first_cluster), 2) self.assertEqual(len(first_cluster), 2)
self.assertEqual(first_cluster[0]['value'], 'DPEC Member at Large') self.assertEqual(first_cluster[0]['value'], 'RSCC Member at Large')
self.assertEqual(first_cluster[0]['count'], 6) self.assertEqual(first_cluster[0]['count'], 233)
# Not strictly necessary to repeat 'Council Member' but a test # Not strictly necessary to repeat 'Council Member' but a test
# of mass_edit, and it's also what the front end sends. # of mass_edit, and it's also what the front end sends.
self.project.mass_edit('Office Title', [{ self.project.mass_edit('Office Title', [{
@ -194,9 +194,9 @@ class TutorialTestEditing(refinetest.RefineTestCase):
# {5}, {6}, {7} # {5}, {6}, {7}
response = self.project.compute_facets(facet.StarredFacet(True)) response = self.project.compute_facets(facet.StarredFacet(True))
self.assertEqual(len(response.facets[0].choices), 2) # true & false self.assertEqual(len(response.facets[0].choices), 2) # true & false
self.assertEqual(response.facets[0].choices[True].count, 2) self.assertEqual(response.facets[0].choices[True].count, 3)
self.project.remove_rows() self.project.remove_rows()
self.assertInResponse('2 rows') self.assertInResponse('3 rows')
class TutorialTestDuplicateDetection(refinetest.RefineTestCase): class TutorialTestDuplicateDetection(refinetest.RefineTestCase):
@ -286,7 +286,7 @@ class TutorialTestTransposeFixedNumberOfRowsIntoColumns(
refinetest.RefineTestCase): refinetest.RefineTestCase):
project_file = 'fixed-rows.csv' project_file = 'fixed-rows.csv'
project_format = 'text/line-based' project_format = 'text/line-based'
project_options = {'headerLines': 0} project_options = {'header_lines': 0}
def test_transpose_fixed_number_of_rows_into_columns(self): def test_transpose_fixed_number_of_rows_into_columns(self):
if self.server.version not in ('2.0', '2.1'): if self.server.version not in ('2.0', '2.1'):
@ -360,7 +360,7 @@ class TutorialTestTransposeVariableNumberOfRowsIntoColumns(
refinetest.RefineTestCase): refinetest.RefineTestCase):
project_file = 'variable-rows.csv' project_file = 'variable-rows.csv'
project_format = 'text/line-based' project_format = 'text/line-based'
project_options = {'headerLines': 0} project_options = {'header_lines': 0}
def test_transpose_variable_number_of_rows_into_columns(self): def test_transpose_variable_number_of_rows_into_columns(self):
# {20}, {21} # {20}, {21}