From 51d2294878f7fe9af59da827a130a1c4e522dd7d Mon Sep 17 00:00:00 2001
From: Paul Makepeace
Date: Tue, 26 Apr 2011 02:02:44 -0400
Subject: [PATCH] Break facet & engine out into facet.py; move refine.py into refine/refine.py. Tidy up (c) notice.

---
 Review note: the google/refine/facet.py hunk was amended in review
 (TextFilterFacet super() call, TextFacet.include() return value, reset() on
 Facet and NumericFacet, Sorting no longer alters caller dicts, docstrings).
 The hunk header and diffstat counts were updated to match; the blob id on
 that hunk's index line was not regenerated.

 google/refine/__init__.py     |   0
 google/refine/facet.py        | 281 ++++++++++++++++++++++++++++++++
 google/{ => refine}/refine.py | 219 ++--------------------------------
 google/test/test_engine.py    |  12 +-
 google/test/test_refine.py    |  59 +++++----
 5 files changed, 325 insertions(+), 246 deletions(-)
 create mode 100644 google/refine/__init__.py
 create mode 100644 google/refine/facet.py
 rename google/{ => refine}/refine.py (72%)

diff --git a/google/refine/__init__.py b/google/refine/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/google/refine/facet.py b/google/refine/facet.py
new file mode 100644
index 0000000..ea73186
--- /dev/null
+++ b/google/refine/facet.py
@@ -0,0 +1,281 @@
+#!/usr/bin/env python
+"""
+Google Refine Facets, Engine, and Facet Responses.
+"""
+
+# Copyright (c) 2011 Paul Makepeace, Real Programmers. All rights reserved.
+
+import json
+import re
+
+
+def to_camel(attr):
+    """convert this_attr_name to thisAttrName."""
+    # Lower-case the first letter; slicing instead of attr[0] means an empty
+    # name yields '' rather than raising IndexError.
+    return (attr[:1].lower() +
+            re.sub(r'_(.)', lambda x: x.group(1).upper(), attr[1:]))
+
+
+def from_camel(attr):
+    """convert thisAttrName to this_attr_name."""
+    # Don't add an underscore for capitalized first letter
+    return re.sub(r'(?<=.)([A-Z])', lambda x: '_' + x.group(1), attr).lower()
+
+
+class Facet(object):
+    """Base class for a facet on one column.
+
+    Every instance attribute is serialized by as_dict().
+    """
+
+    def __init__(self, column, type, expression='value', **options):
+        """column is stored as both name and column_name; each extra keyword
+        option becomes an attribute of the same name."""
+        self.type = type
+        self.name = column
+        self.column_name = column
+        self.expression = expression
+        for k, v in options.items():
+            setattr(self, k, v)
+
+    def as_dict(self):
+        """Return the attributes keyed by camelCase name, omitting any whose
+        value is None."""
+        return dict((to_camel(k), v) for k, v in self.__dict__.items()
+                    if v is not None)
+
+    def reset(self):
+        """Do nothing and return self.
+
+        Present so Engine.reset_all() can call reset() on any facet.
+        """
+        return self
+
+
+class TextFilterFacet(Facet):
+    """Facet of type 'text' holding a query string."""
+
+    def __init__(self, column, query, **options):
+        super(TextFilterFacet, self).__init__(
+            column, type='text', mode='text', query=query, **options)
+
+
+class TextFacet(Facet):
+    """Facet of type 'list' holding a selection of values.
+
+    include(), exclude() and reset() return self so calls can be chained.
+    """
+
+    def __init__(self, column, selection=None, omit_blank=False,
+                 omit_error=False, select_blank=False, select_error=False,
+                 invert=False, **options):
+        """selection may be None, a single value, or a list of values."""
+        super(TextFacet, self).__init__(
+            column,
+            type='list',
+            omit_blank=omit_blank,
+            omit_error=omit_error,
+            select_blank=select_blank,
+            select_error=select_error,
+            invert=invert,
+            **options)
+        self.selection = []
+        if selection is None:
+            selection = []
+        elif not isinstance(selection, list):
+            selection = [selection]
+        for value in selection:
+            self.include(value)
+
+    def include(self, value):
+        """Add value to the selection unless it is already present."""
+        for s in self.selection:
+            if s['v']['v'] == value:
+                # Already selected; return self so chaining still works.
+                return self
+        self.selection.append({'v': {'v': value, 'l': value}})
+        return self
+
+    def exclude(self, value):
+        """Remove value from the selection."""
+        self.selection = [s for s in self.selection
+                          if s['v']['v'] != value]
+        return self
+
+    def reset(self):
+        """Empty the selection."""
+        self.selection = []
+        return self
+
+
+class BoolFacet(TextFacet):
+    """TextFacet that requires an expression and a True/False selection."""
+
+    def __init__(self, column, expression=None, selection=None):
+        if selection is not None and not isinstance(selection, bool):
+            raise ValueError('selection must be True or False.')
+        if expression is None:
+            raise ValueError('Missing expression')
+        super(BoolFacet, self).__init__(column,
+            expression=expression, selection=selection)
+
+
+class StarredFacet(BoolFacet):
+    """BoolFacet on the expression 'row.starred'."""
+
+    def __init__(self, selection=None):
+        super(StarredFacet, self).__init__('',
+            expression='row.starred', selection=selection)
+
+
+class FlaggedFacet(BoolFacet):
+    """BoolFacet on the expression 'row.flagged'."""
+
+    def __init__(self, selection=None):
+        super(FlaggedFacet, self).__init__('',
+            expression='row.flagged', selection=selection)
+
+
+class BlankFacet(BoolFacet):
+    """BoolFacet on the expression 'isBlank(value)' for a column."""
+
+    def __init__(self, column, selection=None):
+        super(BlankFacet, self).__init__(column,
+            expression='isBlank(value)', selection=selection)
+
+
+# Capitalize 'From' to get around python's reserved word.
+class NumericFacet(Facet):
+    """Facet of type 'range' with optional From/to bounds."""
+
+    def __init__(self, column, From=None, to=None, select_blank=True,
+                 select_error=True, select_non_numeric=True,
+                 select_numeric=True, **options):
+        super(NumericFacet, self).__init__(
+            column,
+            type='range',
+            select_blank=select_blank,
+            select_error=select_error,
+            select_non_numeric=select_non_numeric,
+            select_numeric=select_numeric,
+            From=From,
+            to=to,
+            **options)
+
+    def reset(self):
+        """Clear the From and to bounds."""
+        self.From = None
+        self.to = None
+        return self
+
+
+class FacetResponse(object):
+    """One facet of a compute-facets response.
+
+    Each key of the facet dict becomes an attribute named by from_camel().
+    choices maps a choice value to an object with count and selected.
+    """
+
+    def __init__(self, facet):
+        for k, v in facet.items():
+            # NOTE(review): this tests the key, not the value -- confirm.
+            if isinstance(k, bool) or isinstance(k, basestring):
+                setattr(self, from_camel(k), v)
+        # Replaces whatever the loop above stored under 'choices'.
+        self.choices = {}
+
+        class FacetChoice(object):
+            def __init__(self, c):
+                self.count = c['c']
+                self.selected = c['s']
+
+        if 'choices' in facet:
+            for choice in facet['choices']:
+                self.choices[choice['v']['v']] = FacetChoice(choice)
+        if 'blankChoice' in facet:
+            self.blank_choice = FacetChoice(facet['blankChoice'])
+        else:
+            self.blank_choice = None
+        if 'bins' in facet:
+            self.bins = facet['bins']
+            self.base_bins = facet['baseBins']
+
+
+class FacetsResponse(object):
+    """A compute-facets response: a list of FacetResponse plus the mode."""
+
+    def __init__(self, facets):
+        self.facets = [FacetResponse(f) for f in facets['facets']]
+        self.mode = facets['mode']
+
+
+class Engine(object):
+    """A list of facets plus a mode, serializable to JSON."""
+
+    def __init__(self, facets=None, mode='row-based'):
+        """facets may be None, a single facet, or a list of facets."""
+        if facets is None:
+            facets = []
+        elif not isinstance(facets, list):
+            facets = [facets]
+        self.facets = facets
+        self.mode = mode
+
+    def as_dict(self):
+        return {
+            'facets': [f.as_dict() for f in self.facets],  # XXX how with json?
+            'mode': self.mode,
+        }
+
+    def __len__(self):
+        return len(self.facets)
+
+    def as_json(self):
+        return json.dumps(self.as_dict())
+
+    def add_facet(self, facet):
+        self.facets.append(facet)
+
+    def remove_all(self):
+        self.facets = []
+
+    def reset_all(self):
+        """Call reset() on every facet."""
+        for facet in self.facets:
+            facet.reset()
+
+
+class Sorting(object):
+    """Class representing the current sorting order for a project.
+
+    Used in RefineProject.get_rows()"""
+
+    def __init__(self, criteria=None):
+        """criteria may be None, a column name, a criterion dict, or a list
+        of either; a column name becomes a case-insensitive string sort."""
+        self.criteria = []
+        if criteria is None:
+            criteria = []
+        if not isinstance(criteria, list):
+            criteria = [criteria]
+        for criterion in criteria:
+            if isinstance(criterion, basestring):
+                criterion = {
+                    'column': criterion,
+                    'valueType': 'string',
+                    'caseSensitive': False,
+                }
+            else:
+                # Copy so the defaults below don't alter the caller's dict.
+                criterion = dict(criterion)
+            criterion.setdefault('reverse', False)
+            criterion.setdefault('errorPosition', 1)
+            criterion.setdefault('blankPosition', 2)
+            self.criteria.append(criterion)
+
+    def as_json(self):
+        return json.dumps({'criteria': self.criteria})
+
+    def __len__(self):
+        return len(self.criteria)
diff --git a/google/refine.py b/google/refine/refine.py
similarity index 72%
rename from google/refine.py
rename to google/refine/refine.py
index 494e8bc..57ed9a7 100644
--- a/google/refine.py
+++ b/google/refine/refine.py
@@ -3,6 +3,8 @@
 Client library to communicate with a Refine server.
 """
 
+# Copyright (c) 2011 Paul Makepeace, Real Programmers. All rights reserved.
+ import csv import json import gzip @@ -14,205 +16,10 @@ import urllib2_file import urllib2 import urlparse +from google.refine import facet -REFINE_HOST = os.environ.get('GOOGLE_REFINE_HOST', '127.0.0.1') REFINE_PORT = os.environ.get('GOOGLE_REFINE_PORT', '3333') - - -def to_camel(attr): - """convert this_attr_name to thisAttrName.""" - # Do lower case first letter - return (attr[0].lower() + - re.sub(r'_(.)', lambda x: x.group(1).upper(), attr[1:])) - -def from_camel(attr): - """convert thisAttrName to this_attr_name.""" - # Don't add an underscore for capitalized first letter - return re.sub(r'(?<=.)([A-Z])', lambda x: '_' + x.group(1), attr).lower() - - -class Facet(object): - def __init__(self, column, type, expression='value', **options): - self.type = type - self.name = column - self.column_name = column - self.expression = expression - for k, v in options.items(): - setattr(self, k, v) - - def as_dict(self): - return dict([(to_camel(k), v) for k, v in self.__dict__.items() - if v is not None]) - - -class TextFacet(Facet): - def __init__(self, column, selection=None, omit_blank=False, omit_error=False, select_blank=False, select_error=False, invert=False, **options): - super(TextFacet, self).__init__( - column, - type='list', - omit_blank=omit_blank, - omit_error=omit_error, - select_blank=select_blank, - select_error=select_error, - invert=invert, - **options) - self.selection = [] - if selection is None: - selection = [] - elif not isinstance(selection, list): - selection = [selection] - for value in selection: - self.include(value) - - def include(self, value): - for s in self.selection: - if s['v']['v'] == value: - return - self.selection.append({'v': {'v': value, 'l': value}}) - return self - - def exclude(self, value): - self.selection = [s for s in self.selection - if s['v']['v'] != value] - return self - - def reset(self): - self.selection = [] - return self - - -class BoolFacet(TextFacet): - def __init__(self, column, expression=None, 
selection=None): - if selection is not None and not isinstance(selection, bool): - raise ValueError('selection must be True or False.') - if expression is None: - raise ValueError('Missing expression') - super(BoolFacet, self).__init__(column, - expression=expression, selection=selection) - - -class StarredFacet(BoolFacet): - def __init__(self, selection=None): - super(StarredFacet, self).__init__('', - expression='row.starred', selection=selection) - - -class FlaggedFacet(BoolFacet): - def __init__(self, selection=None): - super(FlaggedFacet, self).__init__('', - expression='row.flagged', selection=selection) - - -class BlankFacet(BoolFacet): - def __init__(self, column, selection=None): - super(BlankFacet, self).__init__(column, - expression='isBlank(value)', selection=selection) - - -# Capitalize 'From' to get around python's reserved word. -class NumericFacet(Facet): - def __init__(self, column, From=None, to=None, select_blank=True, select_error=True, select_non_numeric=True, select_numeric=True, **options): - super(NumericFacet, self).__init__( - column, - type='range', - select_blank=select_blank, - select_error=select_error, - select_non_numeric=select_non_numeric, - select_numeric=select_numeric, - From=From, - to=to, - **options) - - -class FacetResponse(object): - def __init__(self, facet): - for k, v in facet.items(): - if isinstance(k, bool) or isinstance(k, basestring): - setattr(self, from_camel(k), v) - self.choices = {} - class FacetChoice(object): - def __init__(self, c): - self.count = c['c'] - self.selected = c['s'] - - if 'choices' in facet: - for choice in facet['choices']: - self.choices[choice['v']['v']] = FacetChoice(choice) - if 'blankChoice' in facet: - self.blank_choice = FacetChoice(facet['blankChoice']) - else: - self.blank_choice = None - if 'bins' in facet: - self.bins = facet['bins'] - self.base_bins = facet['baseBins'] - - -class FacetsResponse(object): - def __init__(self, facets): - self.facets = [FacetResponse(f) for f in 
facets['facets']] - self.mode = facets['mode'] - - -class Engine(object): - def __init__(self, facets=None, mode='row-based'): - if facets is None: - facets = [] - elif not isinstance(facets, list): - facets = [facets] - self.facets = facets - self.mode = mode - - def as_dict(self): - return { - 'facets': [f.as_dict() for f in self.facets], # XXX how with json? - 'mode': self.mode, - } - - def __len__(self): - return len(self.facets) - - def as_json(self): - return json.dumps(self.as_dict()) - - def add_facet(self, facet): - self.facets.append(facet) - - def remove_all(self): - self.facets = [] - - def reset_all(self): - for facet in self.facets: - facet.reset() - - -class Sorting(object): - """Class representing the current sorting order for a project. - - Used in RefineProject.get_rows()""" - def __init__(self, criteria=None): - self.criteria = [] - if criteria is None: - criteria = [] - if not isinstance(criteria, list): - criteria = [criteria] - for criterion in criteria: - if isinstance(criterion, basestring): - criterion = { - 'column': criterion, - 'valueType': 'string', - 'caseSensitive': False, - } - criterion.setdefault('reverse', False) - criterion.setdefault('errorPosition', 1) - criterion.setdefault('blankPosition', 2) - self.criteria.append(criterion) - - def as_json(self): - return json.dumps({'criteria': self.criteria}) - - def __len__(self): - return len(self.criteria) - +REFINE_HOST = os.environ.get('GOOGLE_REFINE_HOST', '127.0.0.1') class RefineServer(object): """Communicate with a Refine server.""" @@ -412,8 +219,8 @@ class RefineProject: project_name or project_id) self.project_id = project_id self.project_name = project_name - self.engine = Engine() - self.sorting = Sorting() + self.engine = facet.Engine() + self.sorting = facet.Sorting() # following filled in by get_models() self.has_records = False self.column_order = {} # order of column in UI @@ -484,31 +291,31 @@ class RefineProject: def compute_facets(self, facets=None): if facets: - 
self.engine = Engine(facets) + self.engine = facet.Engine(facets) response = self.do_json('compute-facets') - return FacetsResponse(response) + return facet.FacetsResponse(response) def get_rows(self, facets=None, sort_by=None, start=0, limit=10): if facets: - self.engine = Engine(facets) + self.engine = facet.Engine(facets) if sort_by is not None: - self.sorting = Sorting(sort_by) + self.sorting = facet.Sorting(sort_by) response = self.do_json('get-rows', {'sorting': self.sorting.as_json(), 'start': start, 'limit': limit}) return self.rows_response_factory(response) def reorder_rows(self, sort_by=None): if sort_by is not None: - self.sorting = Sorting(sort_by) + self.sorting = facet.Sorting(sort_by) response = self.do_json('reorder-rows', {'sorting': self.sorting.as_json()}) # clear sorting - self.sorting = Sorting() + self.sorting = facet.Sorting() return response def remove_rows(self, facets=None): if facets: - self.engine = Engine(facets) + self.engine = facet.Engine(facets) return self.do_json('remove-rows') def text_transform(self, column, expression, on_error='set-to-blank', diff --git a/google/test/test_engine.py b/google/test/test_engine.py index f94cf5c..bb32c3c 100644 --- a/google/test/test_engine.py +++ b/google/test/test_engine.py @@ -1,20 +1,14 @@ #!/usr/bin/env python -# encoding: utf-8 """ test_engine.py - -Created by Paul Makepeace on 2011-04-22. -Copyright (c) 2011 Real Programmers. All rights reserved. """ +# Copyright (c) 2011 Paul Makepeace, Real Programmers. All rights reserved. 
import json -import os -import sys import unittest -import urllib -from google.refine import TextFacet, NumericFacet, StarredFacet, FlaggedFacet -from google.refine import Engine, Sorting, FacetsResponse + +from google.refine.facet import * class FacetTest(unittest.TestCase): diff --git a/google/test/test_refine.py b/google/test/test_refine.py index f8c8c7a..d059470 100644 --- a/google/test/test_refine.py +++ b/google/test/test_refine.py @@ -1,20 +1,15 @@ #!/usr/bin/env python -# encoding: utf-8 """ test_refine.py - -Created by Paul Makepeace on 2011-04-22. -Copyright (c) 2011 Real Programmers. All rights reserved. """ -import sys +# Copyright (c) 2011 Paul Makepeace, Real Programmers. All rights reserved. + import os import unittest -from google.refine import REFINE_HOST, REFINE_PORT -from google.refine import NumericFacet, TextFacet -from google.refine import BlankFacet, StarredFacet, Engine -from google.refine import RefineServer, Refine, RefineProject -from google.refine import to_camel, from_camel + +from google.refine import refine +from google.refine import facet PATH_TO_TEST_DATA = os.path.join('google', 'test', 'data') @@ -27,7 +22,7 @@ class CamelTest(unittest.TestCase): ('From', 'from'), ) for attr, camel_attr in pairs: - self.assertEqual(to_camel(attr), camel_attr) + self.assertEqual(facet.to_camel(attr), camel_attr) def test_from_camel(self): pairs = ( @@ -38,7 +33,7 @@ class CamelTest(unittest.TestCase): ('From', 'from'), ) for camel_attr, attr in pairs: - self.assertEqual(from_camel(camel_attr), attr) + self.assertEqual(facet.from_camel(camel_attr), attr) class RefineTestCase(unittest.TestCase): @@ -47,8 +42,8 @@ class RefineTestCase(unittest.TestCase): project = None # Section "2. 
Exploration using Facets": {1}, {2} def setUp(self): - self.server = RefineServer() - self.refine = Refine(self.server) + self.server = refine.RefineServer() + self.refine = refine.Refine(self.server) if self.project_file: self.project = self.refine.new_project( os.path.join(PATH_TO_TEST_DATA, self.project_file), @@ -62,8 +57,9 @@ class RefineTestCase(unittest.TestCase): class RefineServerTest(RefineTestCase): def test_init(self): - self.assertEqual(self.server.server, 'http://%s:%s' % (REFINE_HOST, REFINE_PORT)) - server = RefineServer('http://refine.example/') + self.assertEqual(self.server.server, + 'http://%s:%s' % (refine.REFINE_HOST, refine.REFINE_PORT)) + server = refine.RefineServer('http://refine.example/') self.assertEqual(server.server, 'http://refine.example') def test_list_projects(self): @@ -80,7 +76,7 @@ class RefineTest(RefineTestCase): project_file = 'duplicates.csv' def test_new_project(self): - self.assertTrue(isinstance(self.project, RefineProject)) + self.assertTrue(isinstance(self.project, refine.RefineProject)) def test_get_models(self): self.assertEqual(self.project.key_column, 'email') @@ -106,7 +102,7 @@ class TutorialTestFacets(RefineTestCase): def test_facet(self): # Section "2. 
Exploration using Facets": {4} - party_code_facet = TextFacet(column='Party Code') + party_code_facet = facet.TextFacet(column='Party Code') response = self.project.compute_facets(party_code_facet) pc = response.facets[0] self.assertEqual(pc.name, 'Party Code') @@ -114,8 +110,8 @@ class TutorialTestFacets(RefineTestCase): self.assertEqual(pc.choices['N'].count, 15) self.assertEqual(pc.blank_choice.count, 1446) # {5}, {6} - engine = Engine(party_code_facet) - ethnicity_facet = TextFacet(column='Ethnicity') + engine = facet.Engine(party_code_facet) + ethnicity_facet = facet.TextFacet(column='Ethnicity') engine.add_facet(ethnicity_facet) self.project.engine = engine response = self.project.compute_facets() @@ -146,12 +142,12 @@ class TutorialTestFacets(RefineTestCase): response = self.project.get_rows() self.assertEqual(response.filtered, 6958) # {11} - office_title_facet = TextFacet('Office Title') + office_title_facet = facet.TextFacet('Office Title') self.project.engine.add_facet(office_title_facet) response = self.project.compute_facets() self.assertEqual(len(response.facets[2].choices), 76) # {12} - XXX not sure how to interpret bins & baseBins yet - office_level_facet = NumericFacet('Office Level') + office_level_facet = facet.NumericFacet('Office Level') self.project.engine.add_facet(office_level_facet) # {13} office_level_facet.From = 300 # from reserved word @@ -168,14 +164,14 @@ class TutorialTestFacets(RefineTestCase): response = self.project.get_rows() self.assertEqual(response.filtered, 6958) # {15} - phone_facet = TextFacet('Phone', expression='value[0, 3]') + phone_facet = facet.TextFacet('Phone', expression='value[0, 3]') self.project.engine.add_facet(phone_facet) response = self.project.compute_facets() p = response.facets[0] self.assertEqual(p.expression, 'value[0, 3]') self.assertEqual(p.choices['318'].count, 2331) # {16} - commissioned_date_facet = NumericFacet('Commissioned Date', + commissioned_date_facet = facet.NumericFacet('Commissioned Date', 
expression='value.toDate().datePart("year")') self.project.engine.add_facet(commissioned_date_facet) response = self.project.compute_facets() @@ -183,7 +179,7 @@ class TutorialTestFacets(RefineTestCase): self.assertEqual(cd.error_count, 959) self.assertEqual(cd.numeric_count, 5999) # {17} - office_description_facet = NumericFacet('Office Description', + office_description_facet = facet.NumericFacet('Office Description', expression=r'value.match(/\D*(\d+)\w\w Rep.*/)[0].toNumber()') self.project.engine.add_facet(office_description_facet) response = self.project.compute_facets() @@ -205,7 +201,7 @@ class TutorialTestEditing(RefineTestCase): self.assertTrue('6067' in response['historyEntry']['description']) # {3} - XXX history # {4} - office_title_facet = TextFacet('Office Title') + office_title_facet = facet.TextFacet('Office Title') self.project.engine.add_facet(office_title_facet) response = self.project.compute_facets() self.assertEqual(len(response.facets[0].choices), 76) @@ -250,14 +246,14 @@ class TutorialTestEditing(RefineTestCase): # {2} if match['value'].endswith(', '): response = self.project.get_rows( - TextFacet('Candidate Name', match['value'])) + facet.TextFacet('Candidate Name', match['value'])) self.assertEqual(len(response.rows), 1) for row in response.rows: response = self.project.star_row(row) self.assertTrue(str(row.index + 1) in response['historyEntry']['description']) # {5}, {6}, {7} - response = self.project.compute_facets(StarredFacet(True)) + response = self.project.compute_facets(facet.StarredFacet(True)) self.assertEqual(len(response.facets[0].choices), 2) # true & false self.assertEqual(response.facets[0].choices[True].count, 3) response = self.project.remove_rows() @@ -301,7 +297,7 @@ class TutorialTestDuplicateDetection(RefineTestCase): emails = [1 if r['email'] else 0 for r in response.rows] self.assertEqual(emails, [1, 0, 1, 1, 1, 0, 0, 1, 1, 0]) # {12} - blank_facet = BlankFacet('email', selection=True) + blank_facet = 
facet.BlankFacet('email', selection=True) # {13} response = self.project.remove_rows(blank_facet) self.assertTrue('Remove 4 rows' in @@ -379,7 +375,8 @@ class TutorialTestTransposeFixedNumbeOfRowsIntoColumns(RefineTestCase): self.assertTrue('Column 1 by filling 4 rows' in response['historyEntry']['description']) # {11} - transaction_facet = TextFacet(column='Transaction', selection='send') + transaction_facet = facet.TextFacet(column='Transaction', + selection='send') self.project.engine.add_facet(transaction_facet) self.project.compute_facets() # {12}, {13}, {14} @@ -467,7 +464,7 @@ class TutorialTestTransposeVariableNumbeOfRowsIntoColumns(RefineTestCase): # {26} self.project.engine.mode = 'row-based' # {27} - blank_facet = BlankFacet('First Line', selection=True) + blank_facet = facet.BlankFacet('First Line', selection=True) response = self.project.remove_rows(blank_facet) self.assertEqual('Remove 14 rows', response['historyEntry']['description'])